author    trav90 <travawine@palemoon.org>    2018-10-15 21:45:30 -0500
committer trav90 <travawine@palemoon.org>    2018-10-15 21:45:30 -0500
commit    68569dee1416593955c1570d638b3d9250b33012 (patch)
tree      d960f017cd7eba3f125b7e8a813789ee2e076310 /third_party/aom/av1/common/x86
parent    07c17b6b98ed32fcecff15c083ab0fd878de3cf0 (diff)
Import aom library
This is the reference implementation for the Alliance for Open Media's AV1 video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36.
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_ssse3.c          1029
-rw-r--r--  third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c          839
-rw-r--r--  third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c           81
-rw-r--r--  third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c     533
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm1d_sse4.h              144
-rw-r--r--  third_party/aom/av1/common/x86/filterintra_sse4.c             898
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c         557
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c        1398
-rw-r--r--  third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h      92
-rw-r--r--  third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c      286
-rw-r--r--  third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c         507
-rw-r--r--  third_party/aom/av1/common/x86/idct_intrin_sse2.c            1402
-rw-r--r--  third_party/aom/av1/common/x86/pvq_sse4.c                     252
-rw-r--r--  third_party/aom/av1/common/x86/pvq_sse4.h                      13
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c             1805
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_sse2.c              297
16 files changed, 10133 insertions, 0 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
new file mode 100644
index 000000000..91102bbaf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#define WIDTH_BOUND (16)
+#define HEIGHT_BOUND (16)
+
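+// Pre-shuffled 8-bit filter coefficients for the SSSE3 kernels below, one
+// entry per sub-pel shift (15 in total); they are filled at runtime by
+// av1_lowbd_convolve_init_ssse3().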
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_12sharp_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
+#endif // CONFIG_DUAL_FILTER
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
+#endif
+
+typedef int8_t (*SubpelFilterCoeffs)[16];
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &sub_pel_filters_12sharp_signal_dir[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
+static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi16(in[0], in[1]);
+ t1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ out[0] = _mm_unpacklo_epi32(t0, t1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(t0, t1);
+ out[3] = _mm_srli_si128(out[2], 8);
+
+ t0 = _mm_unpackhi_epi16(in[0], in[1]);
+ t1 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ out[4] = _mm_unpacklo_epi32(t0, t1);
+ out[5] = _mm_srli_si128(out[4], 8);
+ // Note: We ignore out[6] and out[7] because
+ // they're zero vectors.
+}
+
+typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
+
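+// Average the 16-bit filtered values in *x with the 8 pixels already stored
+// at src ((new + old + 1) >> 1) and pack the result back to 8 bits.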
+static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i y = _mm_loadl_epi64((__m128i const *)src);
+ y = _mm_unpacklo_epi8(y, zero);
+ y = _mm_add_epi16(*x, y);
+ y = _mm_add_epi16(y, one);
+ y = _mm_srai_epi16(y, 1);
+ y = _mm_packus_epi16(y, y);
+ return y;
+}
+
+static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
+ uint32_t temp;
+ __m128i u = _mm_packus_epi16(*x, *x);
+ temp = _mm_cvtsi128_si32(u);
+ *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
+ uint32_t temp;
+ __m128i y = accumulate_store(x, dst);
+ temp = _mm_cvtsi128_si32(y);
+ *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
+ accumulate_store_2_pixel };
+
+static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
+ __m128i u = _mm_packus_epi16(*x, *x);
+ *(int *)dst = _mm_cvtsi128_si32(u);
+}
+
+static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
+ __m128i y = accumulate_store(x, dst);
+ *(int *)dst = _mm_cvtsi128_si32(y);
+}
+
+static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
+ accumulate_store_4_pixel };
+
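+// Horizontally filter four output pixels of one row. f[0] and f[1] hold the
+// same taps offset by two bytes (see init_simd_horiz_filter); together with
+// the loads at src and src + 1 this covers every adjacent-pair product, which
+// transpose_4x8 regroups per output pixel before the final summation.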
+static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store_func, uint8_t *dst) {
+ __m128i sumPairRow[4];
+ __m128i sumPairCol[8];
+ __m128i pixel;
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+
+ if (10 == tapsNum) {
+ src -= 1;
+ }
+
+ pixel = _mm_loadu_si128((__m128i const *)src);
+ sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
+ sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
+ sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
+
+ pixel = _mm_loadu_si128((__m128i const *)(src + 1));
+ sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
+ sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
+ sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
+
+ transpose_4x8(sumPairRow, sumPairCol);
+
+ sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
+ sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
+
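+  // The middle pair of partial sums carries the largest taps; adding them as
+  // min then max keeps the saturating adds from clipping prematurely.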
+ sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
+ sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
+
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
+
+ sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
+ sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
+ sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
+
+ store_func(&sumPairRow[1], dst);
+}
+
+static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w4_ssse3(src, f, tapsNum, store, buf);
+ src += 4;
+ buf += 4;
+ horiz_w4_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w8_ssse3(src, f, tapsNum, store, buf);
+ src += 8;
+ buf += 8;
+ horiz_w8_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w16_ssse3(src, f, tapsNum, store, buf);
+ src += 16;
+ buf += 16;
+ horiz_w16_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w32_ssse3(src, f, tapsNum, store, buf);
+ src += 32;
+ buf += 32;
+ horiz_w32_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w64_ssse3(src, f, tapsNum, store, buf);
+ src += 64;
+ buf += 64;
+ horiz_w64_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
+ uint8_t *) = {
+ horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3,
+ horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
+};
+
+static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
+ int width, store_pixel_t store, uint8_t *dst) {
+ switch (width) {
+    // Note:
+    // For width 2 and 4 the caller must pass the matching 2- or 4-pixel store
+    // function; both widths share the 4-pixel horizontal kernel.
+ case 2:
+ case 4: horizTab[0](src, f, tapsNum, store, dst); break;
+ case 8: horizTab[1](src, f, tapsNum, store, dst); break;
+ case 16: horizTab[2](src, f, tapsNum, store, dst); break;
+ case 32: horizTab[3](src, f, tapsNum, store, dst); break;
+ case 64: horizTab[4](src, f, tapsNum, store, dst); break;
+ case 128: horizTab[5](src, f, tapsNum, store, dst); break;
+ default: assert(0);
+ }
+}
+
+// Vertical 8-pixel parallel
+typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
+ uint8_t *dst, int dst_stride);
+
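+// Scale the 16-bit column-filtered values back to pixel range (mulhrs with
+// 1 << 8 is a round-to-nearest divide by 128, the filter precision), then
+// transpose the 8x8 block back to row order and store it.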
+static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ __m128i v0, v1, v2, v3;
+
+ __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ u0 = _mm_mulhrs_epi16(u0, k_256);
+ u1 = _mm_mulhrs_epi16(u1, k_256);
+ u2 = _mm_mulhrs_epi16(u2, k_256);
+ u3 = _mm_mulhrs_epi16(u3, k_256);
+ u4 = _mm_mulhrs_epi16(u4, k_256);
+ u5 = _mm_mulhrs_epi16(u5, k_256);
+ u6 = _mm_mulhrs_epi16(u6, k_256);
+ u7 = _mm_mulhrs_epi16(u7, k_256);
+
+ v0 = _mm_packus_epi16(u0, u1);
+ v1 = _mm_packus_epi16(u2, u3);
+ v2 = _mm_packus_epi16(u4, u5);
+ v3 = _mm_packus_epi16(u6, u7);
+
+ u0 = _mm_unpacklo_epi8(v0, v1);
+ u1 = _mm_unpackhi_epi8(v0, v1);
+ u2 = _mm_unpacklo_epi8(v2, v3);
+ u3 = _mm_unpackhi_epi8(v2, v3);
+
+ u4 = _mm_unpacklo_epi8(u0, u1);
+ u5 = _mm_unpacklo_epi8(u2, u3);
+ u6 = _mm_unpackhi_epi8(u0, u1);
+ u7 = _mm_unpackhi_epi8(u2, u3);
+
+ u0 = _mm_unpacklo_epi32(u4, u5);
+ u1 = _mm_unpackhi_epi32(u4, u5);
+ u2 = _mm_unpacklo_epi32(u6, u7);
+ u3 = _mm_unpackhi_epi32(u6, u7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ _mm_storel_epi64((__m128i *)dst, u0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
+}
+
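+// Same as transpose8x8_direct_to_dst, but averages the result with the pixels
+// already in dst ((new + old + 1) >> 1) before storing.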
+static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ u0 = _mm_mulhrs_epi16(u0, k_256);
+ u1 = _mm_mulhrs_epi16(u1, k_256);
+ u2 = _mm_mulhrs_epi16(u2, k_256);
+ u3 = _mm_mulhrs_epi16(u3, k_256);
+ u4 = _mm_mulhrs_epi16(u4, k_256);
+ u5 = _mm_mulhrs_epi16(u5, k_256);
+ u6 = _mm_mulhrs_epi16(u6, k_256);
+ u7 = _mm_mulhrs_epi16(u7, k_256);
+
+ v0 = _mm_packus_epi16(u0, u1);
+ v1 = _mm_packus_epi16(u2, u3);
+ v2 = _mm_packus_epi16(u4, u5);
+ v3 = _mm_packus_epi16(u6, u7);
+
+ u0 = _mm_unpacklo_epi8(v0, v1);
+ u1 = _mm_unpackhi_epi8(v0, v1);
+ u2 = _mm_unpacklo_epi8(v2, v3);
+ u3 = _mm_unpackhi_epi8(v2, v3);
+
+ u4 = _mm_unpacklo_epi8(u0, u1);
+ u5 = _mm_unpacklo_epi8(u2, u3);
+ u6 = _mm_unpackhi_epi8(u0, u1);
+ u7 = _mm_unpackhi_epi8(u2, u3);
+
+ u0 = _mm_unpacklo_epi32(u4, u5);
+ u1 = _mm_unpackhi_epi32(u4, u5);
+ u2 = _mm_unpacklo_epi32(u6, u7);
+ u3 = _mm_unpackhi_epi32(u6, u7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+ v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
+ v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
+ v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
+ v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
+
+ u0 = _mm_unpacklo_epi8(u0, zero);
+ u1 = _mm_unpacklo_epi8(u1, zero);
+ u2 = _mm_unpacklo_epi8(u2, zero);
+ u3 = _mm_unpacklo_epi8(u3, zero);
+ u4 = _mm_unpacklo_epi8(u4, zero);
+ u5 = _mm_unpacklo_epi8(u5, zero);
+ u6 = _mm_unpacklo_epi8(u6, zero);
+ u7 = _mm_unpacklo_epi8(u7, zero);
+
+ v0 = _mm_unpacklo_epi8(v0, zero);
+ v1 = _mm_unpacklo_epi8(v1, zero);
+ v2 = _mm_unpacklo_epi8(v2, zero);
+ v3 = _mm_unpacklo_epi8(v3, zero);
+ v4 = _mm_unpacklo_epi8(v4, zero);
+ v5 = _mm_unpacklo_epi8(v5, zero);
+ v6 = _mm_unpacklo_epi8(v6, zero);
+ v7 = _mm_unpacklo_epi8(v7, zero);
+
+ v0 = _mm_adds_epi16(u0, v0);
+ v1 = _mm_adds_epi16(u4, v1);
+ v2 = _mm_adds_epi16(u1, v2);
+ v3 = _mm_adds_epi16(u5, v3);
+ v4 = _mm_adds_epi16(u2, v4);
+ v5 = _mm_adds_epi16(u6, v5);
+ v6 = _mm_adds_epi16(u3, v6);
+ v7 = _mm_adds_epi16(u7, v7);
+
+ v0 = _mm_adds_epi16(v0, one);
+ v1 = _mm_adds_epi16(v1, one);
+ v2 = _mm_adds_epi16(v2, one);
+ v3 = _mm_adds_epi16(v3, one);
+ v4 = _mm_adds_epi16(v4, one);
+ v5 = _mm_adds_epi16(v5, one);
+ v6 = _mm_adds_epi16(v6, one);
+ v7 = _mm_adds_epi16(v7, one);
+
+ v0 = _mm_srai_epi16(v0, 1);
+ v1 = _mm_srai_epi16(v1, 1);
+ v2 = _mm_srai_epi16(v2, 1);
+ v3 = _mm_srai_epi16(v3, 1);
+ v4 = _mm_srai_epi16(v4, 1);
+ v5 = _mm_srai_epi16(v5, 1);
+ v6 = _mm_srai_epi16(v6, 1);
+ v7 = _mm_srai_epi16(v7, 1);
+
+ u0 = _mm_packus_epi16(v0, v1);
+ u1 = _mm_packus_epi16(v2, v3);
+ u2 = _mm_packus_epi16(v4, v5);
+ u3 = _mm_packus_epi16(v6, v7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ _mm_storel_epi64((__m128i *)dst, u0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
+}
+
+static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
+ transpose8x8_accumu_to_dst };
+
+static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
+ __m128i t0, t1, t2, t3, u0, u1;
+
+ t0 = _mm_unpacklo_epi16(in[0], in[1]);
+ t1 = _mm_unpacklo_epi16(in[2], in[3]);
+ t2 = _mm_unpacklo_epi16(in[4], in[5]);
+ t3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ u0 = _mm_unpacklo_epi32(t0, t1);
+ u1 = _mm_unpacklo_epi32(t2, t3);
+
+ out[0] = _mm_unpacklo_epi64(u0, u1);
+ out[1] = _mm_unpackhi_epi64(u0, u1);
+
+ u0 = _mm_unpackhi_epi32(t0, t1);
+ u1 = _mm_unpackhi_epi32(t2, t3);
+
+ out[2] = _mm_unpacklo_epi64(u0, u1);
+ out[3] = _mm_unpackhi_epi64(u0, u1);
+
+ t0 = _mm_unpackhi_epi16(in[0], in[1]);
+ t1 = _mm_unpackhi_epi16(in[2], in[3]);
+ t2 = _mm_unpackhi_epi16(in[4], in[5]);
+ t3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ u0 = _mm_unpacklo_epi32(t0, t1);
+ u1 = _mm_unpacklo_epi32(t2, t3);
+
+ out[4] = _mm_unpacklo_epi64(u0, u1);
+ out[5] = _mm_unpackhi_epi64(u0, u1);
+
+ // Ignore out[6] and out[7]
+ // they're zero vectors.
+}
+
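+// Horizontally filter one output column position for eight consecutive rows:
+// load the eight rows, transpose them so adjacent input pairs line up with
+// the interleaved coefficient pairs, and accumulate the six partial sums.
+// The caller later transposes the 8x8 block of 16-bit results back to rows.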
+static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ __m128i *f, int tapsNum, uint16_t *buf) {
+ __m128i s[8], t[6];
+ __m128i min_x2x3, max_x2x3;
+ __m128i temp;
+
+ if (tapsNum == 10) {
+ src_ptr -= 1;
+ }
+ s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
+ s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // TRANSPOSE...
+  // Vector represents column pixel pairs instead of a row
+ transpose_8x16(s, t);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ s[0] = _mm_maddubs_epi16(t[0], f[0]);
+ s[1] = _mm_maddubs_epi16(t[1], f[1]);
+ s[2] = _mm_maddubs_epi16(t[2], f[2]);
+ s[3] = _mm_maddubs_epi16(t[3], f[3]);
+ s[4] = _mm_maddubs_epi16(t[4], f[4]);
+ s[5] = _mm_maddubs_epi16(t[5], f[5]);
+
+ // add and saturate the results together
+ min_x2x3 = _mm_min_epi16(s[2], s[3]);
+ max_x2x3 = _mm_max_epi16(s[2], s[3]);
+ temp = _mm_adds_epi16(s[0], s[1]);
+ temp = _mm_adds_epi16(temp, s[5]);
+ temp = _mm_adds_epi16(temp, s[4]);
+
+ temp = _mm_adds_epi16(temp, min_x2x3);
+ temp = _mm_adds_epi16(temp, max_x2x3);
+
+ _mm_storeu_si128((__m128i *)buf, temp);
+}
+
+// Vertical 4-pixel parallel
+static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ __m128i v0, v1, v2, v3;
+
+ // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
+ __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpacklo_epi16(u2, u3);
+
+ v2 = _mm_unpacklo_epi32(v0, v1);
+ v3 = _mm_unpackhi_epi32(v0, v1);
+
+ u0 = _mm_mulhrs_epi16(v2, k_256);
+ u1 = _mm_mulhrs_epi16(v3, k_256);
+
+ u0 = _mm_packus_epi16(u0, u1);
+ u1 = _mm_srli_si128(u0, 4);
+ u2 = _mm_srli_si128(u0, 8);
+ u3 = _mm_srli_si128(u0, 12);
+
+ *(int *)(dst) = _mm_cvtsi128_si32(u0);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+
+ __m128i v0, v1, v2, v3;
+
+ __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
+ __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
+ __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpacklo_epi16(u2, u3);
+
+ v2 = _mm_unpacklo_epi32(v0, v1);
+ v3 = _mm_unpackhi_epi32(v0, v1);
+
+ u0 = _mm_mulhrs_epi16(v2, k_256);
+ u1 = _mm_mulhrs_epi16(v3, k_256);
+
+ u2 = _mm_packus_epi16(u0, u1);
+ u0 = _mm_unpacklo_epi8(u2, zero);
+ u1 = _mm_unpackhi_epi8(u2, zero);
+
+ // load pixel values
+ v0 = _mm_loadl_epi64((__m128i const *)(dst));
+ v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+ v0 = _mm_unpacklo_epi8(v0, zero);
+ v1 = _mm_unpacklo_epi8(v1, zero);
+ v2 = _mm_unpacklo_epi8(v2, zero);
+ v3 = _mm_unpacklo_epi8(v3, zero);
+
+ v0 = _mm_unpacklo_epi64(v0, v1);
+ v1 = _mm_unpacklo_epi64(v2, v3);
+
+ u0 = _mm_adds_epi16(u0, v0);
+ u1 = _mm_adds_epi16(u1, v1);
+
+ u0 = _mm_adds_epi16(u0, one);
+ u1 = _mm_adds_epi16(u1, one);
+
+ u0 = _mm_srai_epi16(u0, 1);
+ u1 = _mm_srai_epi16(u1, 1);
+
+ // saturation and pack to pixels
+ u0 = _mm_packus_epi16(u0, u1);
+ u1 = _mm_srli_si128(u0, 4);
+ u2 = _mm_srli_si128(u0, 8);
+ u3 = _mm_srli_si128(u0, 12);
+
+ *(int *)(dst) = _mm_cvtsi128_si32(u0);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
+ transpose4x4_accumu_to_dst };
+
+static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ __m128i *f, int tapsNum, uint16_t *buf) {
+ __m128i A, B, C, D;
+ __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
+ __m128i x0, x1, x2, x3, x4, x5;
+ __m128i min_x2x3, max_x2x3, temp;
+
+ if (tapsNum == 10) {
+ src_ptr -= 1;
+ }
+ A = _mm_loadu_si128((const __m128i *)src_ptr);
+ B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+
+ // TRANSPOSE...
+  // Vector represents column pixel pairs instead of a row
+ // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+ tr0_0 = _mm_unpacklo_epi16(A, B);
+ // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+ tr0_1 = _mm_unpacklo_epi16(C, D);
+ // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+ s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+ s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 02 03 12 13 22 23 32 33
+ s3s2 = _mm_srli_si128(s1s0, 8);
+ // 06 07 16 17 26 27 36 37
+ s7s6 = _mm_srli_si128(s5s4, 8);
+
+ tr0_0 = _mm_unpackhi_epi16(A, B);
+ tr0_1 = _mm_unpackhi_epi16(C, D);
+ s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ sbsa = _mm_srli_si128(s9s8, 8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ x0 = _mm_maddubs_epi16(s1s0, f[0]);
+ x1 = _mm_maddubs_epi16(s3s2, f[1]);
+ x2 = _mm_maddubs_epi16(s5s4, f[2]);
+ x3 = _mm_maddubs_epi16(s7s6, f[3]);
+ x4 = _mm_maddubs_epi16(s9s8, f[4]);
+ x5 = _mm_maddubs_epi16(sbsa, f[5]);
+ // add and saturate the results together
+ min_x2x3 = _mm_min_epi16(x2, x3);
+ max_x2x3 = _mm_max_epi16(x2, x3);
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x5);
+ temp = _mm_adds_epi16(temp, x4);
+
+ temp = _mm_adds_epi16(temp, min_x2x3);
+ temp = _mm_adds_epi16(temp, max_x2x3);
+ _mm_storel_epi64((__m128i *)buf, temp);
+}
+
+// Note:
+// This function assumes:
+// (1) a 10- or 12-tap filter
+// (2) x_step_q4 == 16, so the filter phase is fixed for the whole call
+
+void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ ConvolveParams *conv_params) {
+ DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
+ __m128i verf[6];
+ __m128i horf[2];
+ SubpelFilterCoeffs hCoeffs, vCoeffs;
+ const uint8_t *src_ptr;
+ store_pixel_t store2p = store2pixelTab[conv_params->ref];
+ store_pixel_t store4p = store4pixelTab[conv_params->ref];
+ transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
+ transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];
+
+ const int tapsNum = filter_params.taps;
+ int block_height, block_residu;
+ int i, col, count;
+ (void)x_step_q4;
+
+ if (0 == subpel_x_q4 || 16 != x_step_q4) {
+ av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ return;
+ }
+
+ hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+
+ if (!hCoeffs || !vCoeffs) {
+ av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ horf[0] = *((const __m128i *)(hCoeffs));
+ horf[1] = *((const __m128i *)(hCoeffs + 1));
+
+ count = 0;
+
+  // tapsNum is the filter length (number of taps)
+ src -= (tapsNum >> 1) - 1;
+ src_ptr = src;
+ if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
+ // 8-pixels parallel
+ block_height = h >> 3;
+ block_residu = h & 7;
+
+ do {
+ for (col = 0; col < w; col += 8) {
+ for (i = 0; i < 8; ++i) {
+ filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
+ temp + (i * 8));
+ src_ptr += 1;
+ }
+ transpose_8x8(temp, 8, dst + col, dst_stride);
+ }
+ count++;
+ src_ptr = src + count * src_stride * 8;
+ dst += dst_stride * 8;
+ } while (count < block_height);
+
+ for (i = 0; i < block_residu; ++i) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ if (w > 2) {
+ // 4-pixels parallel
+ block_height = h >> 2;
+ block_residu = h & 3;
+
+ do {
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
+ temp + (i * 4));
+ src_ptr += 1;
+ }
+ transpose_4x4(temp, 4, dst + col, dst_stride);
+ }
+ count++;
+ src_ptr = src + count * src_stride * 4;
+ dst += dst_stride * 4;
+ } while (count < block_height);
+
+ for (i = 0; i < block_residu; ++i) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ for (i = 0; i < h; i++) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ }
+ }
+}
+
+// Vertical convolution filtering
+static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
+ __m128i u = _mm_packus_epi16(*x, *x);
+ _mm_storel_epi64((__m128i *)dst, u);
+}
+
+static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
+ __m128i y = accumulate_store(x, dst);
+ _mm_storel_epi64((__m128i *)dst, y);
+}
+
+static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
+ accumulate_store_8_pixel };
+
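+// Vertically filter eight horizontally adjacent output pixels: interleave the
+// 12 (or 10) source rows into adjacent-row pairs, multiply-add each pair with
+// its coefficient pair and scale the sum back to pixel range.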
+static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
+ int tapsNum, __m128i *f) {
+ __m128i s[12];
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i min_x2x3, max_x2x3, sum;
+ int i = 0;
+ int r = 0;
+
+ if (10 == tapsNum) {
+ i += 1;
+ s[0] = zero;
+ }
+ while (i < 12) {
+ s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+ i += 1;
+ r += 1;
+ }
+
+ s[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s[2] = _mm_unpacklo_epi8(s[2], s[3]);
+ s[4] = _mm_unpacklo_epi8(s[4], s[5]);
+ s[6] = _mm_unpacklo_epi8(s[6], s[7]);
+ s[8] = _mm_unpacklo_epi8(s[8], s[9]);
+ s[10] = _mm_unpacklo_epi8(s[10], s[11]);
+
+ s[0] = _mm_maddubs_epi16(s[0], f[0]);
+ s[2] = _mm_maddubs_epi16(s[2], f[1]);
+ s[4] = _mm_maddubs_epi16(s[4], f[2]);
+ s[6] = _mm_maddubs_epi16(s[6], f[3]);
+ s[8] = _mm_maddubs_epi16(s[8], f[4]);
+ s[10] = _mm_maddubs_epi16(s[10], f[5]);
+
+ min_x2x3 = _mm_min_epi16(s[4], s[6]);
+ max_x2x3 = _mm_max_epi16(s[4], s[6]);
+ sum = _mm_adds_epi16(s[0], s[2]);
+ sum = _mm_adds_epi16(sum, s[10]);
+ sum = _mm_adds_epi16(sum, s[8]);
+
+ sum = _mm_adds_epi16(sum, min_x2x3);
+ sum = _mm_adds_epi16(sum, max_x2x3);
+
+ sum = _mm_mulhrs_epi16(sum, k_256);
+ sum = _mm_packus_epi16(sum, sum);
+ sum = _mm_unpacklo_epi8(sum, zero);
+ return sum;
+}
+
+static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func,
+ uint8_t *dst) {
+ __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
+ store_func(&sum, dst);
+}
+
+static void filter_vert_compute_small(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func, int h,
+ uint8_t *dst, int dst_stride) {
+ int rowIndex = 0;
+ do {
+ filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
+ dst);
+ rowIndex++;
+ src += src_stride;
+ dst += dst_stride;
+ } while (rowIndex < h);
+}
+
+static void filter_vert_compute_large(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func, int w, int h,
+ uint8_t *dst, int dst_stride) {
+ int col;
+ int rowIndex = 0;
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
+
+ do {
+ for (col = 0; col < w; col += 8) {
+ filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
+ store_func, dst_ptr);
+ src_ptr += 8;
+ dst_ptr += 8;
+ }
+ rowIndex++;
+ src_ptr = src + rowIndex * src_stride;
+ dst_ptr = dst + rowIndex * dst_stride;
+ } while (rowIndex < h);
+}
+
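+// Vertical convolution entry point. Falls back to av1_convolve_vert_c unless
+// subpel_y_q4 is non-zero, y_step_q4 == 16 and a pre-shuffled table exists
+// for this filter; otherwise dispatches to the 8-, 4- or 2-pixel-wide path.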
+void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ __m128i verf[6];
+ SubpelFilterCoeffs vCoeffs;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr = dst;
+ store_pixel_t store2p = store2pixelTab[conv_params->ref];
+ store_pixel_t store4p = store4pixelTab[conv_params->ref];
+ store_pixel_t store8p = store8pixelTab[conv_params->ref];
+ const int tapsNum = filter_params.taps;
+
+ if (0 == subpel_y_q4 || 16 != y_step_q4) {
+ av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ return;
+ }
+
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+
+ if (!vCoeffs) {
+ av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= src_stride * ((tapsNum >> 1) - 1);
+ src_ptr = src;
+
+ if (w > 4) {
+ filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
+ dst_ptr, dst_stride);
+ } else if (4 == w) {
+ filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
+ dst_ptr, dst_stride);
+ } else if (2 == w) {
+ filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
+ dst_ptr, dst_stride);
+ } else {
+ assert(0);
+ }
+}
+
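+// Build the 8-bit horizontal filter table: each sub-pel entry stores the taps
+// twice, two bytes apart, zero-padded to 16 bytes ((12 - taps) / 2 leading
+// zeros center a 10-tap filter in the 12-tap layout).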
+static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
+ int8_t (*simd_horiz_filter)[2][16]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ const int16_t *filter_row;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ int i;
+ filter_row = filter_ptr + shift * taps;
+ for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
+
+ for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
+
+ for (i = 0; i < taps; ++i) {
+ simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
+ simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
+ }
+
+ for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
+
+ for (i = offset + 2 + taps; i < 16; ++i)
+ simd_horiz_filter[shift - 1][1][i] = 0;
+ }
+}
+
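+// Build the 8-bit vertical filter table: entry i repeats one coefficient pair
+// across all 16 bytes, matching the interleaved row pairs consumed by
+// filter_vert_ssse3 and filter_horiz_v8p/v4p_ssse3.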
+static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
+ int8_t (*simd_vert_filter)[6][16]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ const int16_t *filter_row;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ int i;
+ filter_row = filter_ptr + shift * taps;
+ for (i = 0; i < 6; ++i) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ int c = i * 2 + (j % 2) - offset;
+ if (c >= 0 && c < taps)
+ simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
+ else
+ simd_vert_filter[shift - 1][i][j] = 0;
+ }
+ }
+ }
+}
+
+typedef struct SimdFilter {
+ InterpFilter interp_filter;
+ int8_t (*simd_horiz_filter)[2][16];
+ int8_t (*simd_vert_filter)[6][16];
+} SimdFilter;
+
+#if CONFIG_DUAL_FILTER
+#define MULTITAP_FILTER_NUM 1
+SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
+ { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
+ &sub_pel_filters_12sharp_ver_signal_dir[0] },
+};
+#endif
+
+#if USE_TEMPORALFILTER_12TAP
+SimdFilter temporal_simd_filter = {
+ TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
+ &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
+};
+#endif
+
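+// Fill the pre-shuffled filter tables above for every filter that has an
+// SSSE3 fast path: the 12-tap temporal filter and, under CONFIG_DUAL_FILTER,
+// MULTITAP_SHARP.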
+void av1_lowbd_convolve_init_ssse3(void) {
+#if USE_TEMPORALFILTER_12TAP
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_horiz_filter(filter_ptr, taps,
+ temporal_simd_filter.simd_horiz_filter);
+ init_simd_vert_filter(filter_ptr, taps,
+ temporal_simd_filter.simd_vert_filter);
+ }
+#endif
+#if CONFIG_DUAL_FILTER
+ {
+ int i;
+ for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
+ InterpFilter interp_filter = simd_filters[i].interp_filter;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_horiz_filter(filter_ptr, taps,
+ simd_filters[i].simd_horiz_filter);
+ init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter);
+ }
+ }
+#endif
+ return;
+}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 000000000..d04b667f1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,839 @@
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
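+// Forward 32-point DCT. Each __m128i holds four 32-bit lanes, so every pass
+// of the col loop transforms four independent columns at once; the
+// btf_32_sse4_1_type0/1 helpers from av1_txfm1d_sse4.h perform the two-point
+// butterfly rotations, rounded and shifted by the per-stage cos_bit.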
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+ buf1[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+ buf1[3], bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+ buf1[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+
+ // stage 9
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[16];
+ buf1[2] = buf0[8];
+ buf1[3] = buf0[24];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[20];
+ buf1[6] = buf0[12];
+ buf1[7] = buf0[28];
+ buf1[8] = buf0[2];
+ buf1[9] = buf0[18];
+ buf1[10] = buf0[10];
+ buf1[11] = buf0[26];
+ buf1[12] = buf0[6];
+ buf1[13] = buf0[22];
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[30];
+ buf1[16] = buf0[1];
+ buf1[17] = buf0[17];
+ buf1[18] = buf0[9];
+ buf1[19] = buf0[25];
+ buf1[20] = buf0[5];
+ buf1[21] = buf0[21];
+ buf1[22] = buf0[13];
+ buf1[23] = buf0[29];
+ buf1[24] = buf0[3];
+ buf1[25] = buf0[19];
+ buf1[26] = buf0[11];
+ buf1[27] = buf0[27];
+ buf1[28] = buf0[7];
+ buf1[29] = buf0[23];
+ buf1[30] = buf0[15];
+ buf1[31] = buf0[31];
+
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
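+// Forward 4-point ADST, four independent transforms in parallel (one per
+// 32-bit lane).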
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
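+// Forward 32-point ADST; same four-lane parallelism and butterfly helpers as
+// av1_fdct32_new_sse4_1.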
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[31];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[29];
+ buf1[3] = buf0[2];
+ buf1[4] = buf0[27];
+ buf1[5] = buf0[4];
+ buf1[6] = buf0[25];
+ buf1[7] = buf0[6];
+ buf1[8] = buf0[23];
+ buf1[9] = buf0[8];
+ buf1[10] = buf0[21];
+ buf1[11] = buf0[10];
+ buf1[12] = buf0[19];
+ buf1[13] = buf0[12];
+ buf1[14] = buf0[17];
+ buf1[15] = buf0[14];
+ buf1[16] = buf0[15];
+ buf1[17] = buf0[16];
+ buf1[18] = buf0[13];
+ buf1[19] = buf0[18];
+ buf1[20] = buf0[11];
+ buf1[21] = buf0[20];
+ buf1[22] = buf0[9];
+ buf1[23] = buf0[22];
+ buf1[24] = buf0[7];
+ buf1[25] = buf0[24];
+ buf1[26] = buf0[5];
+ buf1[27] = buf0[26];
+ buf1[28] = buf0[3];
+ buf1[29] = buf0[28];
+ buf1[30] = buf0[1];
+ buf1[31] = buf0[30];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+ bit);
+ btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+ bit);
+ btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+ buf0[9], bit);
+ btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+ buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+ buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 7
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
+ buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[26] = buf1[26];
+ buf0[27] = buf1[27];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 9
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
+
+ // stage 10
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 11
+ stage_idx++;
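+ // Final output permutation: reorder the butterflied coefficients and
+ // negate every other output.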
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
+ buf1[2] = buf0[24];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+ buf1[4] = buf0[12];
+ buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
+ buf1[6] = buf0[20];
+ buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+ buf1[8] = buf0[6];
+ buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
+ buf1[10] = buf0[30];
+ buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+ buf1[12] = buf0[10];
+ buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
+ buf1[14] = buf0[18];
+ buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[16] = buf0[3];
+ buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
+ buf1[18] = buf0[27];
+ buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+ buf1[20] = buf0[15];
+ buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
+ buf1[22] = buf0[23];
+ buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+ buf1[24] = buf0[5];
+ buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
+ buf1[26] = buf0[29];
+ buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+ buf1[28] = buf0[9];
+ buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
+ buf1[30] = buf0[17];
+ buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 000000000..78c261374
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1;
+ case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_CFG *cfg,
+ int32_t *txfm_buf) {
+ const int txfm_size = cfg->txfm_size;
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
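+ // The 2D transform is two 1D passes with intermediate rounding shifts and
+ // transposes: convert the int16 input to int32 and apply shift[0], run the
+ // column transform, apply shift[1] and transpose, run the row transform,
+ // then apply shift[2] and transpose back to row-major order.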
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
new file mode 100644
index 000000000..cf6249bdc
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
+#endif
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
+#endif
+
+typedef int16_t (*HbdSubpelFilterCoeffs)[8];
+
+typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd);
+
+static INLINE HbdSubpelFilterCoeffs
+hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &subpel_filters_sharp[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &subpel_temporalfilter[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
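+// Pack one `taps`-tap filter per subpel shift into the [6][8] int16 layout
+// used by the SIMD kernels: the coefficients are centered in a zero-padded
+// 12-tap window, and window positions 2*r and 2*r+1 are interleaved four
+// times in row r so _mm_madd_epi16 can consume interleaved pixel pairs.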
+static void init_simd_filter(const int16_t *filter_ptr, int taps,
+ int16_t (*simd_filter)[6][8]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ const int16_t *filter_row = filter_ptr + shift * taps;
+ int i, j;
+ for (i = 0; i < 12; ++i) {
+ for (j = 0; j < 4; ++j) {
+ int r = i / 2;
+ int c = j * 2 + (i % 2);
+ if (i - offset >= 0 && i - offset < taps)
+ simd_filter[shift - 1][r][c] = filter_row[i - offset];
+ else
+ simd_filter[shift - 1][r][c] = 0;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_init_sse4_1(void) {
+#if USE_TEMPORALFILTER_12TAP
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
+ }
+#endif
+#if CONFIG_DUAL_FILTER
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(MULTITAP_SHARP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
+ }
+#endif
+}
+
+// pixelsNum = 0: write all 4 rows of pixels
+// pixelsNum = 1/2/3: write only the residual 1/2/3 rows
+static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
+ int dst_stride) {
+ if (2 == width) {
+ if (0 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+ } else if (1 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ } else if (2 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ } else if (3 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ }
+ } else {
+ if (0 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+ } else if (1 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ } else if (2 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ } else if (3 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ }
+ }
+}
+
+// Clip 16-bit pixels to the range [0, (1 << bd) - 1], bd = 10 or 12
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+ int i;
+
+ for (i = 0; i < numVecs; i++) {
+ mask = _mm_cmpgt_epi16(p[i], max);
+ clamped = _mm_andnot_si128(mask, p[i]);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ p[i] = _mm_and_si128(clamped, mask);
+ }
+}
+
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+ __m128i v0, v1;
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[1] = _mm_add_epi32(u[1], rnd);
+ u[2] = _mm_add_epi32(u[2], rnd);
+ u[3] = _mm_add_epi32(u[3], rnd);
+
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+ u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+ u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+ u[0] = _mm_packus_epi32(u[0], u[1]);
+ u[1] = _mm_packus_epi32(u[2], u[3]);
+
+ highbd_clip(u, 2, bd);
+
+ v0 = _mm_unpacklo_epi16(u[0], u[1]);
+ v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(v0, v1);
+ u[2] = _mm_unpackhi_epi16(v0, v1);
+
+ u[1] = _mm_srli_si128(u[0], 8);
+ u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0 : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
+void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int bd) {
+ __m128i u[4];
+ transClipPixel(src, src_stride, u, bd);
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd) {
+ __m128i u[4], v[4];
+ const __m128i ones = _mm_set1_epi16(1);
+
+ transClipPixel(src, src_stride, u, bd);
+
+ v[0] = _mm_loadl_epi64((__m128i const *)dst);
+ v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+ v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+ u[0] = _mm_add_epi16(u[0], v[0]);
+ u[1] = _mm_add_epi16(u[1], v[1]);
+ u[2] = _mm_add_epi16(u[2], v[2]);
+ u[3] = _mm_add_epi16(u[3], v[3]);
+
+ u[0] = _mm_add_epi16(u[0], ones);
+ u[1] = _mm_add_epi16(u[1], ones);
+ u[2] = _mm_add_epi16(u[2], ones);
+ u[3] = _mm_add_epi16(u[3], ones);
+
+ u[0] = _mm_srai_epi16(u[0], 1);
+ u[1] = _mm_srai_epi16(u[1], 1);
+ u[2] = _mm_srai_epi16(u[2], 1);
+ u[3] = _mm_srai_epi16(u[3], 1);
+
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
+
+static INLINE void transpose_pair(__m128i *in, __m128i *out) {
+ __m128i x0, x1;
+
+ x0 = _mm_unpacklo_epi32(in[0], in[1]);
+ x1 = _mm_unpacklo_epi32(in[2], in[3]);
+
+ out[0] = _mm_unpacklo_epi64(x0, x1);
+ out[1] = _mm_unpackhi_epi64(x0, x1);
+
+ x0 = _mm_unpackhi_epi32(in[0], in[1]);
+ x1 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ out[2] = _mm_unpacklo_epi64(x0, x1);
+ out[3] = _mm_unpackhi_epi64(x0, x1);
+
+ x0 = _mm_unpacklo_epi32(in[4], in[5]);
+ x1 = _mm_unpacklo_epi32(in[6], in[7]);
+
+ out[4] = _mm_unpacklo_epi64(x0, x1);
+ out[5] = _mm_unpackhi_epi64(x0, x1);
+}
+
+static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
+ int tapsNum, uint32_t *buf) {
+ __m128i u[8], v[6];
+
+ if (tapsNum == 10) {
+ src -= 1;
+ }
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
+ u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
+ u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
+ u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
+
+ transpose_pair(u, v);
+
+ u[0] = _mm_madd_epi16(v[0], f[0]);
+ u[1] = _mm_madd_epi16(v[1], f[1]);
+ u[2] = _mm_madd_epi16(v[2], f[2]);
+ u[3] = _mm_madd_epi16(v[3], f[3]);
+ u[4] = _mm_madd_epi16(v[4], f[4]);
+ u[5] = _mm_madd_epi16(v[5], f[5]);
+
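+ // Note: u[6] + u[7] below equals u[2] + u[3]; taking min/max only changes
+ // the order in which the partial sums are accumulated.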
+ u[6] = _mm_min_epi32(u[2], u[3]);
+ u[7] = _mm_max_epi32(u[2], u[3]);
+
+ u[0] = _mm_add_epi32(u[0], u[1]);
+ u[0] = _mm_add_epi32(u[0], u[5]);
+ u[0] = _mm_add_epi32(u[0], u[4]);
+ u[0] = _mm_add_epi32(u[0], u[6]);
+ u[0] = _mm_add_epi32(u[0], u[7]);
+
+ _mm_storeu_si128((__m128i *)buf, u[0]);
+}
+
+void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ int avg, int bd) {
+ DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
+ __m128i verf[6];
+ HbdSubpelFilterCoeffs vCoeffs;
+ const uint16_t *srcPtr;
+ const int tapsNum = filter_params.taps;
+ int i, col, count, blkResidu, blkHeight;
+ TransposeSave transSave = transSaveTab[avg];
+ (void)x_step_q4;
+
+ if (0 == subpel_x_q4 || 16 != x_step_q4) {
+ av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, avg, bd);
+ return;
+ }
+
+ vCoeffs =
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+ if (!vCoeffs) {
+ av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, avg, bd);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= (tapsNum >> 1) - 1;
+ srcPtr = src;
+
+ count = 0;
+ blkHeight = h >> 2;
+ blkResidu = h & 3;
+
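+ // Process the output in 4x4 tiles: each highbd_filter_horiz() call filters
+ // one starting column over four rows into temp, and transSave() transposes,
+ // rounds, clips and stores (or averages) the resulting 4x4 block.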
+ while (blkHeight != 0) {
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+ srcPtr += 1;
+ }
+ transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
+ }
+ count++;
+ srcPtr = src + count * src_stride * 4;
+ dst += dst_stride * 4;
+ blkHeight--;
+ }
+
+ if (blkResidu == 0) return;
+
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+ srcPtr += 1;
+ }
+ transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
+ }
+}
+
+// Vertical convolutional filter
+
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+static void highbdRndingPacks(__m128i *u) {
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[0] = _mm_packus_epi32(u[0], u[0]);
+}
+
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(v);
+}
+
+WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
+
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ _mm_storel_epi64((__m128i *)dst, v);
+}
+
+WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
+
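+// Vertically filter one row of 4 (or 2) output pixels: up to 12 source rows
+// are interleaved pairwise so each _mm_madd_epi16 consumes one pair of taps,
+// then the sums are rounded, clipped and written by saveFunc().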
+static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
+ const __m128i *f, int taps,
+ uint16_t *dst, WritePixels saveFunc,
+ int bd) {
+ __m128i s[12];
+ __m128i zero = _mm_setzero_si128();
+ int i = 0;
+ int r = 0;
+
+ // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+ if (10 == taps) {
+ i += 1;
+ s[0] = zero;
+ }
+ while (i < 12) {
+ s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+ i += 1;
+ r += 1;
+ }
+
+ s[0] = _mm_unpacklo_epi16(s[0], s[1]);
+ s[2] = _mm_unpacklo_epi16(s[2], s[3]);
+ s[4] = _mm_unpacklo_epi16(s[4], s[5]);
+ s[6] = _mm_unpacklo_epi16(s[6], s[7]);
+ s[8] = _mm_unpacklo_epi16(s[8], s[9]);
+ s[10] = _mm_unpacklo_epi16(s[10], s[11]);
+
+ s[0] = _mm_madd_epi16(s[0], f[0]);
+ s[2] = _mm_madd_epi16(s[2], f[1]);
+ s[4] = _mm_madd_epi16(s[4], f[2]);
+ s[6] = _mm_madd_epi16(s[6], f[3]);
+ s[8] = _mm_madd_epi16(s[8], f[4]);
+ s[10] = _mm_madd_epi16(s[10], f[5]);
+
+ s[1] = _mm_min_epi32(s[4], s[6]);
+ s[3] = _mm_max_epi32(s[4], s[6]);
+
+ s[0] = _mm_add_epi32(s[0], s[2]);
+ s[0] = _mm_add_epi32(s[0], s[10]);
+ s[0] = _mm_add_epi32(s[0], s[8]);
+ s[0] = _mm_add_epi32(s[0], s[1]);
+ s[0] = _mm_add_epi32(s[0], s[3]);
+
+ saveFunc(s, bd, dst);
+}
+
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int col;
+ int rowIndex = 0;
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+ const int step = 4;
+ WritePixels write4pixels = write4pixelsTab[avg];
+
+ do {
+ for (col = 0; col < w; col += step) {
+ filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
+ write4pixels, bd);
+ src_ptr += step;
+ dst_ptr += step;
+ }
+ rowIndex++;
+ src_ptr = src + rowIndex * src_stride;
+ dst_ptr = dst + rowIndex * dst_stride;
+ } while (rowIndex < h);
+}
+
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int rowIndex = 0;
+ WritePixels write2pixels = write2pixelsTab[avg];
+ (void)w;
+
+ do {
+ filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
+ rowIndex++;
+ src += src_stride;
+ dst += dst_stride;
+ } while (rowIndex < h);
+}
+
+void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ int avg, int bd) {
+ __m128i verf[6];
+ HbdSubpelFilterCoeffs vCoeffs;
+ const int tapsNum = filter_params.taps;
+
+ if (0 == subpel_y_q4 || 16 != y_step_q4) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ vCoeffs =
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+ if (!vCoeffs) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= src_stride * ((tapsNum >> 1) - 1);
+
+ if (w > 2) {
+ highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ } else {
+ highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..af7afb7ee
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+#ifndef AV1_TXMF1D_SSE2_H_
+#define AV1_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// the entire input block can be represented by a grid of 4x4 blocks
+// each 4x4 block can be represented by 4 vertical __m128i vectors
+// we first transpose each 4x4 block internally
+// then transpose the grid
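+// e.g. for a 32x32 block col_size is 8, so row r of the block occupies
+// input[r * 8] .. input[r * 8 + 7], and each transpose_32_4x4() call below
+// transposes one 4x4 tile of 32-bit lanes spanning four consecutive rows.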
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+ const int size, const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
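+
+// For example, the 32-point forward transform earlier in this patch computes,
+// per 32-bit lane (with rounding and a right shift by `bit`),
+//   buf0[16] =  buf1[16] * cospi[4] + buf1[17] * cospi[60]
+//   buf0[17] = -buf1[17] * cospi[4] + buf1[16] * cospi[60]
+// via:
+//   btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+//                       buf0[17], bit);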
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in1_w0, in0_w1); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_TXMF1D_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 000000000..4f77da446
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#if USE_3TAP_INTRA_FILTER
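+// No 3-tap SSE4.1 implementation is provided; the dummy function below
+// presumably just keeps this translation unit non-empty in that configuration.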
+void filterintra_sse4_3tap_dummy_func(void);
+void filterintra_sse4_3tap_dummy_func(void) {}
+#else
+
+static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+}
+
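+// Each GetMeanValueNxN() returns the rounded mean of the 2 * bs reference
+// pixels (bs from above, bs from the left) and also broadcasts it into
+// *params for later SIMD use.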
+static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+
+ u0 = _mm_unpackhi_epi8(a, zero);
+ u1 = _mm_unpackhi_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(sum[0], u0);
+ sum[0] = _mm_add_epi16(sum[0], u1);
+}
+
+static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector[0]);
+ AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// params[4] : the mean value, replicated across all 4 int32_t lanes
+//
+static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
+ const uint8_t *left, int bs,
+ __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = GetMeanValue4x4(above, left, params); break;
+ case 8: meanValue = GetMeanValue8x8(above, left, params); break;
+ case 16: meanValue = GetMeanValue16x16(above, left, params); break;
+ case 32: meanValue = GetMeanValue32x32(above, left, params); break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
+//
+static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
+ const TX_SIZE tx_size =
+ (bs == 32) ? TX_32X32
+ : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+ // c0
+ params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0]);
+ // c1
+ params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1]);
+ // c2
+ params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2]);
+ // c3
+ params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3]);
+}
+
+static const int maxBlkSize = 32;
+
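+// The SavePredNxN() helpers add the broadcast mean back to the int32
+// prediction residuals (stored with stride 2 * maxBlkSize + 1), pack them to
+// 8 bits with unsigned saturation and write the block to dst.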
+static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
+ __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
+ __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ *((int *)dst) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
+}
+
+static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 8) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ r += 1;
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 16) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+ int r = 0;
+
+ while (r < 32) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
+ p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
+ p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
+ p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p4 = _mm_add_epi32(p4, mean[0]);
+ p5 = _mm_add_epi32(p5, mean[0]);
+ p6 = _mm_add_epi32(p6, mean[0]);
+ p7 = _mm_add_epi32(p7, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ p4 = _mm_packus_epi32(p4, p5);
+ p5 = _mm_packus_epi32(p6, p7);
+ p4 = _mm_packus_epi16(p4, p5);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+
+ _mm_storel_epi64((__m128i *)(dst + 16), p4);
+ p4 = _mm_srli_si128(p4, 8);
+ _mm_storel_epi64((__m128i *)(dst + 24), p4);
+
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
+ ptrdiff_t stride) {
+ switch (bs) {
+ case 4: SavePred4x4(pred, mean, dst, stride); break;
+ case 8: SavePred8x8(pred, mean, dst, stride); break;
+ case 16: SavePred16x16(pred, mean, dst, stride); break;
+ case 32: SavePred32x32(pred, mean, dst, stride); break;
+ default: assert(0);
+ }
+}
+
+typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride);
+
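+// Each Produce*Pixels() variant fills 1 to 4 pixels of the next prediction
+// row: p[0..2] hold three shifted copies of the current row weighted by the
+// taps in prm[], and the serial term c0 * x adds the pixel just produced to
+// the left before rounding by FILTER_INTRA_PREC_BITS.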
+static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+
+ sum = _mm_extract_epi32(u0, 3);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 4) = x;
+}
+
+static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+}
+
+static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+}
+
+static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+}
+
+static ProducePixelsFunc prodPixelsFuncTab[4] = {
+ ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
+};
+
+static void ProducePixels(int *pred, const __m128i *prm, int remain) {
+ __m128i p[3];
+ const int predStride = (maxBlkSize << 1) + 1;
+ int index;
+
+ p[0] = _mm_loadu_si128((const __m128i *)pred);
+ p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
+ p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
+
+ if (remain <= 2) {
+ return;
+ }
+ if (remain > 5) {
+ index = 3;
+ } else {
+ index = remain - 3;
+ }
+ prodPixelsFuncTab[index](p, prm, pred, predStride);
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
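+// e.g. bs = 4, r = 0: colBound = 8, so the c = 0 step produces 4 pixels and
+// the c = 4 step produces the remaining 3 (2 * bs - r - 1 = 7 in total).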
+static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
+ const int bs, const __m128i *prm, int meanValue,
+ uint8_t *dst, ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
+}
+
+static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
+ __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
+ GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
+}
+
+void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+// ============== High Bit Depth ==============
+#if CONFIG_HIGHBITDEPTH
+static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// Process 16 pixels each from above and left, 10-bit depth.
+// The second 8 pixels of each side are added onto the sum of the first 8.
+static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ sum[0] = _mm_add_epi16(a, l);
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+ sum[0] = _mm_add_epi16(sum[0], a);
+ sum[0] = _mm_add_epi16(sum[0], l);
+}
+
+// Note:
+// Process 16 pixels each from above and left, 12-bit depth.
+// The pixels are widened to 32 bits and accumulated into 4 partial sums.
+static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v0, v1;
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(v0, v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+}
+
+static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
+ AddPixels10bit(above, left, &sum_vector);
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi32(sum_vector, zero);
+ u = _mm_srli_si128(sum_vector, 4);
+ sum_vector = _mm_add_epi32(u, sum_vector);
+ sum_value = _mm_extract_epi32(sum_vector, 0);
+ }
+
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
+ AddPixels10bit(above, left, &sum_vector[0]);
+ AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector[0]);
+ AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
+ u = _mm_srli_si128(sum_vector[0], 4);
+ sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
+ sum_value = _mm_extract_epi32(sum_vector[0], 0);
+ }
+
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// params[4] : the mean value, replicated across all 4 int32_t lanes
+//
+static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
+ const uint16_t *left, int bs,
+ const int bd, __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
+ case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
+ case 16:
+ meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
+ break;
+ case 32:
+ meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
+ break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// At row r and column c, the number of remaining reference pixels is
+// R = 2 * bs + 1 - r - c, so the number of pixels still to produce is
+// R - 2 = 2 * bs - r - c - 1.
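+// For example, with bs = 4, r = 0, c = 0: R = 9 reference pixels remain and
+// 7 prediction pixels are still to be produced on that row.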
+static void HighbdGeneratePrediction(const uint16_t *above,
+ const uint16_t *left, const int bs,
+ const int bd, const __m128i *prm,
+ int meanValue, uint16_t *dst,
+ ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+ int ipred;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = pred[r + 1][c + 1] + meanValue;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+
+static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
+ int bs, const int bd, __m128i *prm,
+ uint16_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
+ HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
+}
+
+void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#endif // USE_3TAP_INTRA_FILTER
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 000000000..d10f1ccc2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,557 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+// Note:
+// A 32x32 block of coefficients needs 32x4 = 128 registers in total.
+// For high bit depth, each coefficient is 4 bytes, so one __m256i
+// register holds 8 coefficients and each row needs 4 registers;
+// there are 32 rows in total.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
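+// With this layout, the 8 coefficients of row r starting at column 8 * g
+// (g = 0..3) live in register in[4 * r + g].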
+
+static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0], in[4]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+
+ u2 = _mm256_unpacklo_epi32(in[8], in[12]);
+ u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+
+ u4 = _mm256_unpacklo_epi32(in[16], in[20]);
+ u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+
+ u6 = _mm256_unpacklo_epi32(in[24], in[28]);
+ u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
+ transpose_32x32_8x8(&in[0], &out[0]);
+ transpose_32x32_8x8(&in[1], &out[32]);
+ transpose_32x32_8x8(&in[32], &out[1]);
+ transpose_32x32_8x8(&in[33], &out[33]);
+}
+
+static void transpose_32x32(const __m256i *in, __m256i *out) {
+ transpose_32x32_16x16(&in[0], &out[0]);
+ transpose_32x32_16x16(&in[2], &out[64]);
+ transpose_32x32_16x16(&in[64], &out[2]);
+ transpose_32x32_16x16(&in[66], &out[66]);
+}
+
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+ int i;
+ for (i = 0; i < 128; ++i) {
+ in[i] = _mm256_loadu_si256((const __m256i *)coeff);
+ coeff += 8;
+ }
+}
+
+static void round_shift_32x32(__m256i *in, int shift) {
+ __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+ int i = 0;
+
+ while (i < 128) {
+ in[i] = _mm256_add_epi32(in[i], rnding);
+ in[i] = _mm256_srai_epi32(in[i], shift);
+ i++;
+ }
+}
+
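+// Clamp the packed 16-bit pixel lanes to the range [0, (1 << bd) - 1].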
+static __m256i highbd_clamp_epi32(__m256i x, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(x, max);
+ clamped = _mm256_andnot_si256(mask, x);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
+
+static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
+ const __m256i zero = _mm256_setzero_si256();
+ int i = 0;
+ (void)fliplr;
+ (void)flipud;
+
+ round_shift_32x32(in, shift);
+
+ while (i < 128) {
+ u0 = _mm256_loadu_si256((const __m256i *)output);
+ u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
+
+ x0 = _mm256_unpacklo_epi16(u0, zero);
+ x1 = _mm256_unpackhi_epi16(u0, zero);
+ x2 = _mm256_unpacklo_epi16(u1, zero);
+ x3 = _mm256_unpackhi_epi16(u1, zero);
+
+ v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
+ v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
+ v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
+ v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
+
+ v0 = _mm256_add_epi32(v0, x0);
+ v1 = _mm256_add_epi32(v1, x1);
+ v2 = _mm256_add_epi32(v2, x2);
+ v3 = _mm256_add_epi32(v3, x3);
+
+ v0 = _mm256_packus_epi32(v0, v1);
+ v2 = _mm256_packus_epi32(v2, v3);
+
+ v0 = highbd_clamp_epi32(v0, bd);
+ v2 = highbd_clamp_epi32(v2, bd);
+
+ _mm256_storeu_si256((__m256i *)output, v0);
+ _mm256_storeu_si256((__m256i *)(output + 16), v2);
+ output += stride;
+ i += 4;
+ }
+}
+
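+// Butterfly half: per 32-bit lane, return (w0 * n0 + w1 * n1 + rounding) >> bit.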
+static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1,
+ __m256i n1, __m256i rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(w0, n0);
+ y = _mm256_mullo_epi32(w1, n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i bf1[32], bf0[32];
+ int col;
+
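+ // Each 32-wide row spans 4 __m256i registers; process one 8-lane column
+ // group (selected by 'col') per iteration.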
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0 * 4 + col];
+ bf1[1] = in[16 * 4 + col];
+ bf1[2] = in[8 * 4 + col];
+ bf1[3] = in[24 * 4 + col];
+ bf1[4] = in[4 * 4 + col];
+ bf1[5] = in[20 * 4 + col];
+ bf1[6] = in[12 * 4 + col];
+ bf1[7] = in[28 * 4 + col];
+ bf1[8] = in[2 * 4 + col];
+ bf1[9] = in[18 * 4 + col];
+ bf1[10] = in[10 * 4 + col];
+ bf1[11] = in[26 * 4 + col];
+ bf1[12] = in[6 * 4 + col];
+ bf1[13] = in[22 * 4 + col];
+ bf1[14] = in[14 * 4 + col];
+ bf1[15] = in[30 * 4 + col];
+ bf1[16] = in[1 * 4 + col];
+ bf1[17] = in[17 * 4 + col];
+ bf1[18] = in[9 * 4 + col];
+ bf1[19] = in[25 * 4 + col];
+ bf1[20] = in[5 * 4 + col];
+ bf1[21] = in[21 * 4 + col];
+ bf1[22] = in[13 * 4 + col];
+ bf1[23] = in[29 * 4 + col];
+ bf1[24] = in[3 * 4 + col];
+ bf1[25] = in[19 * 4 + col];
+ bf1[26] = in[11 * 4 + col];
+ bf1[27] = in[27 * 4 + col];
+ bf1[28] = in[7 * 4 + col];
+ bf1[29] = in[23 * 4 + col];
+ bf1[30] = in[15 * 4 + col];
+ bf1[31] = in[31 * 4 + col];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit);
+ bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit);
+ bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit);
+ bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit);
+ bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit);
+ bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit);
+ bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit);
+ bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit);
+ bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit);
+ bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit);
+ bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit);
+ bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit);
+ bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit);
+ bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit);
+ bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit);
+ bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit);
+ bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit);
+ bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit);
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
+ bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
+ bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
+ bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
+ bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
+ bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
+ bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
+ bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
+ bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
+ bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
+ bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
+ bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
+ bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
+ bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
+ bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit);
+ bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit);
+ bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit);
+ bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit);
+ bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
+ bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
+ bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
+ bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
+ bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
+ bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
+ bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
+ bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit);
+ bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit);
+ bf0[22] =
+ half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit);
+ bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit);
+ bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit);
+ bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit);
+ bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit);
+ bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
+ bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
+ bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
+ bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit);
+ bf1[10] =
+ half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit);
+ bf1[15] = bf0[15];
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
+ bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
+ bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
+ bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
+ bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
+ bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
+ bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
+ bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
+ bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
+ bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
+ bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
+ bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
+ bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
+ bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
+ bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
+
+ // stage 6
+ bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
+ bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
+ bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
+ bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
+ bf0[4] = bf1[4];
+ bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[7] = bf1[7];
+ bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
+ bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
+ bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
+ bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
+ bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
+ bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
+ bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
+ bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit);
+ bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit);
+ bf0[20] =
+ half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit);
+ bf0[21] =
+ half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit);
+ bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit);
+ bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
+ bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
+ bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
+ bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
+ bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
+ bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
+ bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
+ bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
+ bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
+ bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
+ bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
+ bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
+ bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
+ bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
+ bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
+ bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
+ bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
+ bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
+ bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
+ bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
+ bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
+ bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
+
+ // stage 8
+ bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
+ bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
+ bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
+ bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
+ bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
+ bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
+ bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
+ bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
+ bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
+ bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
+ bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
+ bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
+ bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
+ bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
+ bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
+ bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit);
+ bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit);
+ bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit);
+ bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit);
+ bf0[25] = half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
+ out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
+ out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
+ out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
+ out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
+ out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
+ out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
+ out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
+ out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
+ out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
+ out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
+ out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
+ out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
+ out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
+ out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
+ out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
+ out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
+ out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
+ out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
+ out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
+ out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
+ out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
+ out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
+ out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
+ out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
+ out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
+ out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
+ out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
+ out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
+ out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
+ out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
+ out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
+ }
+}
+
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m256i in[128], out[128];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_32;
+ load_buffer_32x32(coeff, in);
+ transpose_32x32(in, out);
+ idct32_avx2(out, in, cfg->cos_bit_row[2]);
+ round_shift_32x32(in, -cfg->shift[0]);
+ transpose_32x32(in, out);
+ idct32_avx2(out, in, cfg->cos_bit_col[2]);
+ write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 000000000..24b2760b9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
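+ // Transpose the 4x4 block of 32-bit coefficients before the 1-D transform.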
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ in[0] = _mm_add_epi32(v0, v3);
+ in[1] = _mm_add_epi32(v1, v2);
+ in[2] = _mm_sub_epi32(v1, v2);
+ in[3] = _mm_sub_epi32(v0, v3);
+}
+
+static void iadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ // stage 0
+ // stage 1
+ u1 = _mm_sub_epi32(zero, u1);
+ u3 = _mm_sub_epi32(zero, u3);
+
+ // stage 2
+ v0 = u0;
+ v1 = u3;
+ x = _mm_mullo_epi32(u1, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+
+ // stage 4
+ x = _mm_mullo_epi32(u0, cospi8);
+ y = _mm_mullo_epi32(u1, cospi56);
+ in[3] = _mm_add_epi32(x, y);
+ in[3] = _mm_add_epi32(in[3], rnding);
+ in[3] = _mm_srai_epi32(in[3], bit);
+
+ x = _mm_mullo_epi32(u0, cospi56);
+ y = _mm_mullo_epi32(u1, cospim8);
+ in[0] = _mm_add_epi32(x, y);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ x = _mm_mullo_epi32(u2, cospi40);
+ y = _mm_mullo_epi32(u3, cospi24);
+ in[1] = _mm_add_epi32(x, y);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[1] = _mm_srai_epi32(in[1], bit);
+
+ x = _mm_mullo_epi32(u2, cospi24);
+ y = _mm_mullo_epi32(u3, cospim40);
+ in[2] = _mm_add_epi32(x, y);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[2] = _mm_srai_epi32(in[2], bit);
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ round_shift_4x4(in, shift);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+ v0 = _mm_unpacklo_epi16(v0, zero);
+ v1 = _mm_unpacklo_epi16(v1, zero);
+ v2 = _mm_unpacklo_epi16(v2, zero);
+ v3 = _mm_unpacklo_epi16(v3, zero);
+
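+ // Shuffle control 0x1B reverses the four 32-bit lanes, mirroring each row.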
+ if (fliplr) {
+ in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+ in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+ in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+ in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+ }
+
+ if (flipud) {
+ u0 = _mm_add_epi32(in[3], v0);
+ u1 = _mm_add_epi32(in[2], v1);
+ u2 = _mm_add_epi32(in[1], v2);
+ u3 = _mm_add_epi32(in[0], v3);
+ } else {
+ u0 = _mm_add_epi32(in[0], v0);
+ u1 = _mm_add_epi32(in[1], v1);
+ u2 = _mm_add_epi32(in[2], v2);
+ u3 = _mm_add_epi32(in[3], v3);
+ }
+
+ v0 = _mm_packus_epi32(u0, u1);
+ v2 = _mm_packus_epi32(u2, u3);
+
+ u0 = highbd_clamp_epi16(v0, bd);
+ u2 = highbd_clamp_epi16(v2, bd);
+
+ v0 = _mm_unpacklo_epi64(u0, u0);
+ v1 = _mm_unpackhi_epi64(u0, u0);
+ v2 = _mm_unpacklo_epi64(u2, u2);
+ v3 = _mm_unpackhi_epi64(u2, u2);
+
+ _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+ _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+ _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+ _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[4];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+ in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+ in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+ in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+ in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+ in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+ in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+ in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+ in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+ in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+ in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+ in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+ in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even columns: 0, 2, ..., 14
+ // Odd columns: 1, 3, ..., 15
+ // One even column plus one odd column makes up one row of 8 coefficients,
+ // for 8 rows in total (8x8).
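+ // Row r therefore occupies registers in[2 * r] (even half) and
+ // in[2 * r + 1] (odd half).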
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = _mm_add_epi32(u4, u5);
+ v5 = _mm_sub_epi32(u4, u5);
+ v6 = _mm_sub_epi32(u7, u6);
+ v7 = _mm_add_epi32(u6, u7);
+
+ // stage 4
+ u0 = _mm_add_epi32(v0, v3);
+ u1 = _mm_add_epi32(v1, v2);
+ u2 = _mm_sub_epi32(v1, v2);
+ u3 = _mm_sub_epi32(v0, v3);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ out[0 * 2 + col] = _mm_add_epi32(u0, u7);
+ out[1 * 2 + col] = _mm_add_epi32(u1, u6);
+ out[2 * 2 + col] = _mm_add_epi32(u2, u5);
+ out[3 * 2 + col] = _mm_add_epi32(u3, u4);
+ out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
+ out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
+ out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
+ out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+ }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even columns: 0, 2, ..., 14
+ // Odd columns: 1, 3, ..., 15
+ // One even column plus one odd column makes up one row of 8 coefficients,
+ // for 8 rows in total (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
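+// Reconstruct one 8-pixel row: widen the prediction to 32 bits, add the
+// residual (mirrored when fliplr is set), then pack and clamp to bit depth bd.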
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+ int fliplr, int bd) {
+ __m128i x0, x1;
+ const __m128i zero = _mm_setzero_si128();
+
+ x0 = _mm_unpacklo_epi16(pred, zero);
+ x1 = _mm_unpackhi_epi16(pred, zero);
+
+ if (fliplr) {
+ res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+ res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+ x0 = _mm_add_epi32(res_hi, x0);
+ x1 = _mm_add_epi32(res_lo, x1);
+
+ } else {
+ x0 = _mm_add_epi32(res_lo, x0);
+ x1 = _mm_add_epi32(res_hi, x1);
+ }
+
+ x0 = _mm_packus_epi32(x0, x1);
+ return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ round_shift_8x8(in, shift);
+
+ v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+ v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+ v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+ v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+ v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+ v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+ v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+ v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+ if (flipud) {
+ u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+ } else {
+ u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+ }
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+ _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+ _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+ _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+ _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+ _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+ _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+ _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[16], out[16];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 16x16
+static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
+ int i;
+ for (i = 0; i < 64; ++i) {
+ in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+ }
+}
+
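+// Copy one 8x8 quadrant out of the 16x16 register array. Each 16-wide row
+// occupies 4 registers, so a quadrant takes registers col and col + 1 from
+// 8 consecutive rows (col advances by 4 per row).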
+static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
+ int col) {
+ int i;
+ for (i = 0; i < 16; i += 2) {
+ in8x8[i] = in[col];
+ in8x8[i + 1] = in[col + 1];
+ col += 4;
+ }
+}
+
+static void swap_addr(uint16_t **output1, uint16_t **output2) {
+ uint16_t *tmp;
+ tmp = *output1;
+ *output1 = *output2;
+ *output2 = tmp;
+}
+
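+// Write the 16x16 result as four 8x8 quadrants; flips are handled by swapping
+// the destination quadrant pointers and by passing the flags to write_buffer_8x8.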
+static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i in8x8[16];
+ uint16_t *leftUp = &output[0];
+ uint16_t *rightUp = &output[8];
+ uint16_t *leftDown = &output[8 * stride];
+ uint16_t *rightDown = &output[8 * stride + 8];
+
+ if (fliplr) {
+ swap_addr(&leftUp, &rightUp);
+ swap_addr(&leftDown, &rightDown);
+ }
+
+ if (flipud) {
+ swap_addr(&leftUp, &leftDown);
+ swap_addr(&rightUp, &rightDown);
+ }
+
+ // Left-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 0);
+ write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
+
+ // Right-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 2);
+ write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+
+ // Left-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 32);
+ write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+
+ // Right-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 34);
+ write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x, y;
+ int col;
+
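+ // Each 16-wide row spans 4 __m128i registers; process one 4-lane column
+ // group (selected by 'col') per iteration.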
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = in[8 * 4 + col];
+ u[2] = in[4 * 4 + col];
+ u[3] = in[12 * 4 + col];
+ u[4] = in[2 * 4 + col];
+ u[5] = in[10 * 4 + col];
+ u[6] = in[6 * 4 + col];
+ u[7] = in[14 * 4 + col];
+ u[8] = in[1 * 4 + col];
+ u[9] = in[9 * 4 + col];
+ u[10] = in[5 * 4 + col];
+ u[11] = in[13 * 4 + col];
+ u[12] = in[3 * 4 + col];
+ u[13] = in[11 * 4 + col];
+ u[14] = in[7 * 4 + col];
+ u[15] = in[15 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
+ u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
+ u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
+ u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[10], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[14], v[15]);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[6], u[7]);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[13], v[14]);
+ u[15] = _mm_add_epi32(v[12], v[15]);
+
+ // stage 6
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
+ out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
+ out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
+ out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
+ out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
+ out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
+ out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
+ out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
+ out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
+ out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
+ out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
+ out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
+ out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
+ out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
+ out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
+ out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+ }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+ u[3] = in[8 * 4 + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+ u[5] = in[12 * 4 + col];
+ u[6] = in[4 * 4 + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+ u[9] = in[14 * 4 + col];
+ u[10] = in[6 * 4 + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+ u[12] = in[2 * 4 + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+ u[15] = in[10 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
+ v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
+ v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
+ v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
+ v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+
+ // stage 9
+ out[0 * 4 + col] = v[1];
+ out[1 * 4 + col] = v[14];
+ out[2 * 4 + col] = v[3];
+ out[3 * 4 + col] = v[12];
+ out[4 * 4 + col] = v[5];
+ out[5 * 4 + col] = v[10];
+ out[6 * 4 + col] = v[7];
+ out[7 * 4 + col] = v[8];
+ out[8 * 4 + col] = v[9];
+ out[9 * 4 + col] = v[6];
+ out[10 * 4 + col] = v[11];
+ out[11 * 4 + col] = v[4];
+ out[12 * 4 + col] = v[13];
+ out[13 * 4 + col] = v[2];
+ out[14 * 4 + col] = v[15];
+ out[15 * 4 + col] = v[0];
+ }
+}
+
+static void round_shift_16x16(__m128i *in, int shift) {
+ round_shift_8x8(&in[0], shift);
+ round_shift_8x8(&in[16], shift);
+ round_shift_8x8(&in[32], shift);
+ round_shift_8x8(&in[48], shift);
+}
+
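+// All cases below share the same 2-D flow: load the 16x16 coefficients,
+// transpose, run the first 1-D transform at cos_bit_row[2], round-shift by
+// -shift[0], transpose again, run the second 1-D transform at
+// cos_bit_col[2], then hand the result, -shift[1], the flip flags and 'bd'
+// to write_buffer_16x16() for reconstruction.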
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[64], out[64];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 000000000..bc96defe3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
+#define _HIGHBD_TXFM_UTILITY_SSE4_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+ // Upper left 8x8
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+
+ // Lower right 8x8
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+// Note:
+// rounding = 1 << (bit - 1)
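+// Per 32-bit lane this computes one half of a butterfly with rounding, i.e.
+// the scalar form (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit.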
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1,
+ __m128i n1, __m128i rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(w0, n0);
+ y = _mm_mullo_epi32(w1, n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
new file mode 100644
index 000000000..c25db88b7
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
+/* SSSE3 version of the rotzoom/affine warp filter */
+void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
+ int height, int stride, uint16_t *pred,
+ int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int bd, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ __m128i tmp[15];
+#else
+#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
+#endif
+ int i, j, k;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // (x, y) coordinates of the center of this block in the destination
+ // image
+ int32_t dst_x = p_col + j + 4;
+ int32_t dst_y = p_row + i + 4;
+
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
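+ // (ix4, iy4) is the integer part and (sx4, sy4) the fractional part (at
+ // WARPEDMODEL_PREC_BITS precision) of the warped source position of this
+ // block's centre; the per-sample filter phases below are offsets from
+ // these values, scaled by alpha/beta horizontally and gamma/delta
+ // vertically.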
+
+ // Horizontal filter
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else if (ix4 >= width + 6) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k +
+ // Include rounding and offset here
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ // Filter even-index pixels
+ __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
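+ // With the taps interleaved as above, each _mm_madd_epi16() below sums a
+ // pair of adjacent taps for output pixels 0, 2, 4 and 6 in a single
+ // instruction; the odd-indexed pixels are handled the same way further
+ // down.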
+ // Calculate filtered results
+ __m128i res_0 = _mm_madd_epi16(src, coeff_0);
+ __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
+ __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
+ __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Filter odd-index pixels
+ __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
+ __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
+ __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
+ __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ __m128i *src = tmp + (k + 4);
+ __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ // Round and pack into 16 bits
+ __m128i round_const =
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 000000000..efc8d1e24
--- /dev/null
+++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_HIGHBITDEPTH
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+#else
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
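+// Note: with CONFIG_HIGHBITDEPTH, tran_low_t is wider than 16 bits, so
+// load_coeff() narrows each coefficient to int16_t while loading; otherwise
+// a single 256-bit load brings in sixteen 16-bit coefficients at once.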
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+ int i = 0;
+ while (i < 16) {
+ load_coeff(coeff + (i << 4), &in[i]);
+ i += 1;
+ }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = _mm_loadu_si128((__m128i const *)output);
+ __m128i p0 = _mm_unpacklo_epi8(x, zero);
+ __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+ p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+ p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+ x = _mm_packus_epi16(p0, p1);
+ _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+ const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+ recon_and_store(&in[i], output + i * stride);
+ i += 1;
+ }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b0, __m256i *b1) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ *b0 = butter_fly(x0, x1, *c0);
+ *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+ const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+ const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+ const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+ const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ // stage 1, (0-7)
+ u0 = in[0];
+ u1 = in[8];
+ u2 = in[4];
+ u3 = in[12];
+ u4 = in[2];
+ u5 = in[10];
+ u6 = in[6];
+ u7 = in[14];
+
+ // stage 2, (0-7)
+ // stage 3, (0-7)
+ t0 = u0;
+ t1 = u1;
+ t2 = u2;
+ t3 = u3;
+ unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+ unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+ // stage 4, (0-7)
+ unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+ unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+ u4 = _mm256_add_epi16(t4, t5);
+ u5 = _mm256_sub_epi16(t4, t5);
+ u6 = _mm256_sub_epi16(t7, t6);
+ u7 = _mm256_add_epi16(t7, t6);
+
+ // stage 5, (0-7)
+ t0 = _mm256_add_epi16(u0, u3);
+ t1 = _mm256_add_epi16(u1, u2);
+ t2 = _mm256_sub_epi16(u1, u2);
+ t3 = _mm256_sub_epi16(u0, u3);
+ t4 = u4;
+ t7 = u7;
+ unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+ // stage 6, (0-7)
+ u0 = _mm256_add_epi16(t0, t7);
+ u1 = _mm256_add_epi16(t1, t6);
+ u2 = _mm256_add_epi16(t2, t5);
+ u3 = _mm256_add_epi16(t3, t4);
+ u4 = _mm256_sub_epi16(t3, t4);
+ u5 = _mm256_sub_epi16(t2, t5);
+ u6 = _mm256_sub_epi16(t1, t6);
+ u7 = _mm256_sub_epi16(t0, t7);
+
+ // stage 1, (8-15)
+ v0 = in[1];
+ v1 = in[9];
+ v2 = in[5];
+ v3 = in[13];
+ v4 = in[3];
+ v5 = in[11];
+ v6 = in[7];
+ v7 = in[15];
+
+ // stage 2, (8-15)
+ unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+ unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+ unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+ // stage 3, (8-15)
+ v0 = _mm256_add_epi16(t0, t1);
+ v1 = _mm256_sub_epi16(t0, t1);
+ v2 = _mm256_sub_epi16(t3, t2);
+ v3 = _mm256_add_epi16(t2, t3);
+ v4 = _mm256_add_epi16(t4, t5);
+ v5 = _mm256_sub_epi16(t4, t5);
+ v6 = _mm256_sub_epi16(t7, t6);
+ v7 = _mm256_add_epi16(t6, t7);
+
+ // stage 4, (8-15)
+ t0 = v0;
+ t7 = v7;
+ t3 = v3;
+ t4 = v4;
+ unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+ // stage 5, (8-15)
+ v0 = _mm256_add_epi16(t0, t3);
+ v1 = _mm256_add_epi16(t1, t2);
+ v2 = _mm256_sub_epi16(t1, t2);
+ v3 = _mm256_sub_epi16(t0, t3);
+ v4 = _mm256_sub_epi16(t7, t4);
+ v5 = _mm256_sub_epi16(t6, t5);
+ v6 = _mm256_add_epi16(t6, t5);
+ v7 = _mm256_add_epi16(t7, t4);
+
+ // stage 6, (8-15)
+ t0 = v0;
+ t1 = v1;
+ t6 = v6;
+ t7 = v7;
+ unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+ unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+ // stage 7
+ in[0] = _mm256_add_epi16(u0, t7);
+ in[1] = _mm256_add_epi16(u1, t6);
+ in[2] = _mm256_add_epi16(u2, t5);
+ in[3] = _mm256_add_epi16(u3, t4);
+ in[4] = _mm256_add_epi16(u4, t3);
+ in[5] = _mm256_add_epi16(u5, t2);
+ in[6] = _mm256_add_epi16(u6, t1);
+ in[7] = _mm256_add_epi16(u7, t0);
+ in[8] = _mm256_sub_epi16(u7, t0);
+ in[9] = _mm256_sub_epi16(u6, t1);
+ in[10] = _mm256_sub_epi16(u5, t2);
+ in[11] = _mm256_sub_epi16(u4, t3);
+ in[12] = _mm256_sub_epi16(u3, t4);
+ in[13] = _mm256_sub_epi16(u2, t5);
+ in[14] = _mm256_sub_epi16(u1, t6);
+ in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ idct16_avx2(in);
+}
+
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ b[0] = _mm256_madd_epi16(x0, *c0);
+ b[1] = _mm256_madd_epi16(x1, *c0);
+ b[2] = _mm256_madd_epi16(x0, *c1);
+ b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ int i;
+ for (i = 0; i < num; ++i) {
+ a[i] = _mm256_add_epi32(a[i], dct_rounding);
+ a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+ }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_add_epi32(a[0], b[0]);
+ x[1] = _mm256_add_epi32(a[1], b[1]);
+ x[2] = _mm256_add_epi32(a[2], b[2]);
+ x[3] = _mm256_add_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_sub_epi32(a[0], b[0]);
+ x[1] = _mm256_sub_epi32(a[1], b[1]);
+ x[2] = _mm256_sub_epi32(a[2], b[2]);
+ x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+ group_rounding(a, 4);
+ out[0] = _mm256_packs_epi32(a[0], a[1]);
+ out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
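+
+// butterfly_32b() keeps its four madd products at 32-bit precision so that
+// add_rnd()/sub_rnd() can add or subtract two butterflies before applying
+// the shared DCT_CONST_ROUNDING shift; butterfly_rnd() rounds a single
+// butterfly. All three pack the results back into 16-bit lanes.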
+
+static void iadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i x[16], s[16];
+ __m256i u[4], v[4];
+
+ // stage 1
+ butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+ butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+ add_rnd(u, v, &x[0]);
+ sub_rnd(u, v, &x[8]);
+
+ butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+ butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+ add_rnd(u, v, &x[2]);
+ sub_rnd(u, v, &x[10]);
+
+ butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+ butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[12]);
+
+ butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+ butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+ add_rnd(u, v, &x[6]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 2
+ s[0] = _mm256_add_epi16(x[0], x[4]);
+ s[1] = _mm256_add_epi16(x[1], x[5]);
+ s[2] = _mm256_add_epi16(x[2], x[6]);
+ s[3] = _mm256_add_epi16(x[3], x[7]);
+ s[4] = _mm256_sub_epi16(x[0], x[4]);
+ s[5] = _mm256_sub_epi16(x[1], x[5]);
+ s[6] = _mm256_sub_epi16(x[2], x[6]);
+ s[7] = _mm256_sub_epi16(x[3], x[7]);
+ butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+ butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+ add_rnd(u, v, &s[8]);
+ sub_rnd(u, v, &s[12]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+ add_rnd(u, v, &s[10]);
+ sub_rnd(u, v, &s[14]);
+
+ // stage 3
+ x[0] = _mm256_add_epi16(s[0], s[2]);
+ x[1] = _mm256_add_epi16(s[1], s[3]);
+ x[2] = _mm256_sub_epi16(s[0], s[2]);
+ x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+ x[8] = _mm256_add_epi16(s[8], s[10]);
+ x[9] = _mm256_add_epi16(s[9], s[11]);
+ x[10] = _mm256_sub_epi16(s[8], s[10]);
+ x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+ butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[6]);
+
+ butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[12]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 4
+ butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+ butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+ butterfly_rnd(u, &x[2]);
+ butterfly_rnd(v, &x[6]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+ butterfly_rnd(u, &x[10]);
+ butterfly_rnd(v, &x[14]);
+
+ in[0] = x[0];
+ in[1] = _mm256_sub_epi16(zero, x[8]);
+ in[2] = x[12];
+ in[3] = _mm256_sub_epi16(zero, x[4]);
+ in[4] = x[6];
+ in[5] = x[14];
+ in[6] = x[10];
+ in[7] = x[2];
+ in[8] = x[3];
+ in[9] = x[11];
+ in[10] = x[15];
+ in[11] = x[7];
+ in[12] = x[5];
+ in[13] = _mm256_sub_epi16(zero, x[13]);
+ in[14] = x[9];
+ in[15] = _mm256_sub_epi16(zero, x[1]);
+}
+
+static void iadst16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ mm256_reverse_epi16(&in[i]);
+ }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+ *dest = *dest + (rows - 1) * (*stride);
+ *stride = -*stride;
+}
+
+static void iidtx16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif  // CONFIG_EXT_TX
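+
+// Each 1-D helper above (idct16/iadst16/iidtx16) transposes its input before
+// transforming, so calling two of them back to back applies both directions
+// of the 2-D inverse transform. The FLIPADST variants reuse the plain ADST
+// and apply the flip afterwards: flip_row() reverses each residual row
+// (left/right flip) and flip_col() makes the stores walk the destination
+// bottom-up (up/down flip).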
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m256i in[16];
+
+ load_buffer_16x16(input, in);
+ switch (tx_type) {
+ case DCT_DCT:
+ idct16(in);
+ idct16(in);
+ break;
+ case ADST_DCT:
+ idct16(in);
+ iadst16(in);
+ break;
+ case DCT_ADST:
+ iadst16(in);
+ idct16(in);
+ break;
+ case ADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ idct16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case DCT_FLIPADST:
+ iadst16(in);
+ idct16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ flip_col(&dest, &stride, 16);
+ break;
+ case ADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case IDTX:
+ iidtx16(in);
+ iidtx16(in);
+ break;
+ case V_DCT:
+ iidtx16(in);
+ idct16(in);
+ break;
+ case H_DCT:
+ idct16(in);
+ iidtx16(in);
+ break;
+ case V_ADST:
+ iidtx16(in);
+ iadst16(in);
+ break;
+ case H_ADST:
+ iadst16(in);
+ iidtx16(in);
+ break;
+ case V_FLIPADST:
+ iidtx16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case H_FLIPADST:
+ iadst16(in);
+ iidtx16(in);
+ flip_row(in, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x16(in, stride, dest);
+}
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
new file mode 100644
index 000000000..522e8988c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
@@ -0,0 +1,1402 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#if CONFIG_EXT_TX
+static INLINE void fliplr_4x4(__m128i in[2]) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+ fliplr_8x8(&in[0]);
+ fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) \
+ do { \
+ __m128i *tmp; \
+ fliplr_16x8(in0); \
+ fliplr_16x8(in1); \
+ tmp = (in0); \
+ (in0) = (in1); \
+ (in1) = tmp; \
+ } while (0)
+
+#define FLIPUD_PTR(dest, stride, size) \
+ do { \
+ (dest) = (dest) + ((size)-1) * (stride); \
+ (stride) = -(stride); \
+ } while (0)
+#endif  // CONFIG_EXT_TX
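+
+// The FLIPADST cases below are built from the plain ADST: the 1-D transform
+// itself is unchanged and the flip is applied afterwards, either to the
+// residual (fliplr_* / FLIPLR_16x16 reverse each row for a left/right flip)
+// or to the destination pointer (FLIPUD_PTR writes the rows bottom-up for an
+// up/down flip).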
+
+void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[2];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct4_sse2(in);
+ aom_idct4_sse2(in);
+ break;
+ case ADST_DCT:
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ break;
+ case DCT_ADST:
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ break;
+ case ADST_ADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ fliplr_4x4(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ // Final round and shift
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store result[0]
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store result[1]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store result[2]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ // store result[3]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ // load input data
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8 * 1);
+ in[2] = load_input_data(input + 8 * 2);
+ in[3] = load_input_data(input + 8 * 3);
+ in[4] = load_input_data(input + 8 * 4);
+ in[5] = load_input_data(input + 8 * 5);
+ in[6] = load_input_data(input + 8 * 6);
+ in[7] = load_input_data(input + 8 * 7);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ break;
+ case DCT_ADST:
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ break;
+ case ADST_ADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ fliplr_8x8(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+ in[4] = _mm_srai_epi16(in[4], 5);
+ in[5] = _mm_srai_epi16(in[5], 5);
+ in[6] = _mm_srai_epi16(in[6], 5);
+ in[7] = _mm_srai_epi16(in[7], 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+#if CONFIG_EXT_TX
+static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ idtx16_8col(in0);
+ idtx16_8col(in1);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[32];
+ __m128i *in0 = &in[0];
+ __m128i *in1 = &in[16];
+
+ load_buffer_8x16(input, in0);
+ input += 8;
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case DCT_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_DCT:
+ iidtx16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case H_DCT:
+ aom_idct16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_ADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case H_ADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_FLIPADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case H_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ write_buffer_8x16(dest, in0, stride);
+ dest += 8;
+ write_buffer_8x16(dest, in1, stride);
+}
+
+#if CONFIG_EXT_TX
+static void iidtx8_sse2(__m128i *in) {
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+}
+
+static INLINE void iidtx4_sse2(__m128i *in) {
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+}
+
+// Flip the 8x8 block of 16-bit coefficients left/right (reverse each row).
+static INLINE void flip_buffer_lr_8x8(__m128i *in) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+ in[4] = load_input_data(input + 4 * 8);
+ in[5] = load_input_data(input + 5 * 8);
+ in[6] = load_input_data(input + 6 * 8);
+ in[7] = load_input_data(input + 7 * 8);
+
+ in[8] = load_input_data(input + 8 * 8);
+ in[9] = load_input_data(input + 9 * 8);
+ in[10] = load_input_data(input + 10 * 8);
+ in[11] = load_input_data(input + 11 * 8);
+ in[12] = load_input_data(input + 12 * 8);
+ in[13] = load_input_data(input + 13 * 8);
+ in[14] = load_input_data(input + 14 * 8);
+ in[15] = load_input_data(input + 15 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_idct8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_iadst8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: idtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case H_DCT:
+#endif
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x16(dest, in, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest + stride * 15, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+
+static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+  // Load the 16x8 input and transpose it into in[], one 8x8 block at a time
+ in[0] = load_input_data(input + 0 * 16);
+ in[1] = load_input_data(input + 1 * 16);
+ in[2] = load_input_data(input + 2 * 16);
+ in[3] = load_input_data(input + 3 * 16);
+ in[4] = load_input_data(input + 4 * 16);
+ in[5] = load_input_data(input + 5 * 16);
+ in[6] = load_input_data(input + 6 * 16);
+ in[7] = load_input_data(input + 7 * 16);
+ array_transpose_8x8(in, in);
+
+ in[8] = load_input_data(input + 8 + 0 * 16);
+ in[9] = load_input_data(input + 8 + 1 * 16);
+ in[10] = load_input_data(input + 8 + 2 * 16);
+ in[11] = load_input_data(input + 8 + 3 * 16);
+ in[12] = load_input_data(input + 8 + 4 * 16);
+ in[13] = load_input_data(input + 8 + 5 * 16);
+ in[14] = load_input_data(input + 8 + 6 * 16);
+ in[15] = load_input_data(input + 8 + 7 * 16);
+ array_transpose_8x8(in + 8, in + 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: idtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Scale
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in + 8);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x8_round6(dest, in, stride);
+ write_buffer_8x8_round6(dest + 8, in + 8, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ write_buffer_8x8_round6(dest + stride * 7, in, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest, in + 8, stride);
+ write_buffer_8x8_round6(dest + 8, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+
+static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+}
+
+void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST: aom_iadst8_sse2(in); break;
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST: aom_iadst8_sse2(in); break;
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
+#endif
+ break;
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x8(in);
+
+ // Repack data. We pack into the bottom half of 'in'
+ // so that the next repacking stage can pack into the
+ // top half without overwriting anything
+ in[7] = _mm_unpacklo_epi64(in[6], in[7]);
+ in[6] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[5] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[4] = _mm_unpacklo_epi64(in[0], in[1]);
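+  // After this repack, the low 64 bits of in[4] hold the first four values of
+  // the old in[0] and the high 64 bits hold the first four values of the old
+  // in[1]; in[5]..in[7] pair up rows 2/3, 4/5 and 6/7 in the same way.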
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ FLIPUD_PTR(dest, stride, 4);
+#endif
+ break;
+ default: assert(0); break;
+ }
+ write_buffer_8x4_round5(dest, in, stride);
+}
+
+static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
+ __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
+ __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
+ __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
+
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
+ d4 = _mm_unpacklo_epi32(d4, d5);
+ d6 = _mm_unpacklo_epi32(d6, d7);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d4 = _mm_unpacklo_epi8(d4, zero);
+ d6 = _mm_unpacklo_epi8(d6, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d4 = _mm_add_epi16(d4, in[2]);
+ d6 = _mm_add_epi16(d6, in[3]);
+
+ d0 = _mm_packus_epi16(d0, d2);
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_packus_epi16(d4, d6);
+ *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ // Load rows, packed two per element of 'in'.
+ // We pack into the bottom half of 'in' so that the
+ // later repacking stage can pack into the
+ // top half without overwriting anything
+ in[4] = load_input_data(input + 0 * 8);
+ in[5] = load_input_data(input + 1 * 8);
+ in[6] = load_input_data(input + 2 * 8);
+ in[7] = load_input_data(input + 3 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x4(in + 4);
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx8_sse2(in);
+ array_transpose_8x8(in, in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ in[0] = _mm_unpacklo_epi64(in[0], in[1]);
+ in[1] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[2] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[3] = _mm_unpacklo_epi64(in[6], in[7]);
+ write_buffer_4x8_round5(dest, in, stride);
+}
+
+// Note: The 16-column 32-element transforms take input in the form of four
+// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
+// of the overall 16x32 input buffer.
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ idct32_8col(tl, bl);
+ idct32_8col(tr, br);
+}
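+
+// A minimal sketch (for illustration; not used elsewhere in this file) of the
+// quadrant layout described above: each quadrant is an 8x16 block of int16
+// coefficients stored one row per __m128i. This helper maps a (row, col)
+// position of the 16x32 block to the register that holds it; the lane within
+// that register is (col & 7).
+static INLINE const __m128i *locate_16x32_quadrant(const __m128i *tl,
+                                                   const __m128i *tr,
+                                                   const __m128i *bl,
+                                                   const __m128i *br, int row,
+                                                   int col) {
+  const __m128i *quad = (row < 16) ? (col < 8 ? tl : tr) : (col < 8 ? bl : br);
+  return &quad[row & 15];
+}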
+
+static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ __m128i tmpl[16], tmpr[16];
+ int i;
+
+ // Copy the top half of the input to temporary storage
+ for (i = 0; i < 16; ++i) {
+ tmpl[i] = tl[i];
+ tmpr[i] = tr[i];
+ }
+
+ // Generate the top half of the output
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(bl[i], 2);
+ tr[i] = _mm_slli_epi16(br[i], 2);
+ }
+ array_transpose_16x16(tl, tr);
+
+ // Copy the temporary storage back to the bottom half of the input
+ for (i = 0; i < 16; ++i) {
+ bl[i] = tmpl[i];
+ br[i] = tmpr[i];
+ }
+
+ // Generate the bottom half of the output
+ scale_sqrt2_8x16(bl);
+ scale_sqrt2_8x16(br);
+ aom_idct16_sse2(bl, br); // Includes a transposition
+}
+
+#if CONFIG_EXT_TX
+static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ int i;
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(tl[i], 2);
+ tr[i] = _mm_slli_epi16(tr[i], 2);
+ bl[i] = _mm_slli_epi16(bl[i], 2);
+ br[i] = _mm_slli_epi16(br[i], 2);
+ }
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
+ __m128i *intr, __m128i *inbl,
+ __m128i *inbr, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ intl[i] = _mm_adds_epi16(intl[i], final_rounding);
+ intr[i] = _mm_adds_epi16(intr[i], final_rounding);
+ inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
+ inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
+ intl[i] = _mm_srai_epi16(intl[i], 6);
+ intr[i] = _mm_srai_epi16(intr[i], 6);
+ inbl[i] = _mm_srai_epi16(inbl[i], 6);
+ inbr[i] = _mm_srai_epi16(inbr[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, intl[i]);
+ RECON_AND_STORE(dest + i * stride + 8, intr[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
+ }
+}
+
+void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i intl[16], intr[16], inbl[16], inbr[16];
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ intl[i] = load_input_data(input + i * 16 + 0);
+ intr[i] = load_input_data(input + i * 16 + 8);
+ inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
+ inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct16_sse2(intl, intr);
+ aom_idct16_sse2(inbl, inbr);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst16_sse2(intl, intr);
+ aom_iadst16_sse2(inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx16_sse2(intl, intr);
+ iidtx16_sse2(inbl, inbr);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(intl);
+ scale_sqrt2_8x16(intr);
+ scale_sqrt2_8x16(inbl);
+ scale_sqrt2_8x16(inbr);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct32_16col(intl, intr, inbl, inbr);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ ihalfright32_16col(intl, intr, inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ FLIPUD_PTR(dest, stride, 32);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
+}
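+
+// A scalar sketch of the left/right flip done in the FLIPADST cases above:
+// reversing each 8-lane half with mm_reverse_epi16 and then swapping the two
+// halves reverses the whole 16-sample row.
+static INLINE void flip_row16_sketch(int16_t *row /* 16 values */) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    const int16_t tmp = row[i];
+    row[i] = row[15 - i];
+    row[15 - i] = tmp;
+  }
+}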
+
+static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
+ __m128i *in1, __m128i *in2,
+ __m128i *in3, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = _mm_adds_epi16(in0[i], final_rounding);
+ in1[i] = _mm_adds_epi16(in1[i], final_rounding);
+ in2[i] = _mm_adds_epi16(in2[i], final_rounding);
+ in3[i] = _mm_adds_epi16(in3[i], final_rounding);
+ in0[i] = _mm_srai_epi16(in0[i], 6);
+ in1[i] = _mm_srai_epi16(in1[i], 6);
+ in2[i] = _mm_srai_epi16(in2[i], 6);
+ in3[i] = _mm_srai_epi16(in3[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, in0[i]);
+ RECON_AND_STORE(dest + i * stride + 8, in1[i]);
+ RECON_AND_STORE(dest + i * stride + 16, in2[i]);
+ RECON_AND_STORE(dest + i * stride + 24, in3[i]);
+ }
+}
+
+void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16], in2[16], in3[16];
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = load_input_data(input + i * 32 + 0);
+ in1[i] = load_input_data(input + i * 32 + 8);
+ in2[i] = load_input_data(input + i * 32 + 16);
+ in3[i] = load_input_data(input + i * 32 + 24);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ ihalfright32_16col(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(in0);
+ scale_sqrt2_8x16(in1);
+ scale_sqrt2_8x16(in2);
+ scale_sqrt2_8x16(in3);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in2, in3);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in2, in3);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
new file mode 100644
index 000000000..b3ed9efdf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <float.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/x86/pvq_sse4.h"
+#include "../odintrin.h"
+#include "av1/common/pvq.h"
+
+#define EPSILON 1e-15f
+
+static __m128 horizontal_sum_ps(__m128 x) {
+ x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
+ x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
+ return x;
+}
+
+static __m128i horizontal_sum_epi32(__m128i x) {
+ x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+ x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
+ return x;
+}
+
+static INLINE float rsqrtf(float x) {
+ float y;
+ _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
+ return y;
+}
+
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. This is a float-precision PVQ search just to make sure
+ * our tests aren't limited by numerical accuracy. It's close to the
+ * pvq_search_rdo_double_c implementation, but is not bit accurate and
+ * it performs slightly worse on PSNR. One reason is that this code runs
+ * more RDO iterations than the C code. It also uses single precision
+ * floating point math, whereas the C version uses double precision.
+ *
+ * @param [in] xcoeff input vector to quantize (x in the math doc)
+ * @param [in] n number of dimensions
+ * @param [in] k number of pulses
+ * @param [out] ypulse optimal codevector found (y in the math doc)
+ * @param [in] g2 multiplier for the distortion (typically squared
+ * gain units)
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] prev_k number of pulses already in ypulse that we should
+ * reuse for the search (or 0 for a new search)
+ * @return cosine similarity between x and y (between 0 and 1)
+ */
+double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
+ int *ypulse, double g2,
+ double pvq_norm_lambda, int prev_k) {
+ int i, j;
+ int reuse_pulses = prev_k > 0 && prev_k <= k;
+ /* TODO - This blows our 8kB stack space budget and should be fixed when
+ converting PVQ to fixed point. */
+ float xx = 0, xy = 0, yy = 0;
+ float x[MAXN + 3];
+ float y[MAXN + 3];
+ float sign_y[MAXN + 3];
+ for (i = 0; i < n; i++) {
+ float tmp = (float)xcoeff[i];
+ xx += tmp * tmp;
+ x[i] = xcoeff[i];
+ }
+
+ x[n] = x[n + 1] = x[n + 2] = 0;
+ ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
+
+ __m128 sums = _mm_setzero_ps();
+ for (i = 0; i < n; i += 4) {
+ __m128 x4 = _mm_loadu_ps(&x[i]);
+ __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+ /* Save the sign, we'll put it back later. */
+ _mm_storeu_ps(&sign_y[i], s4);
+ /* Get rid of the sign. */
+ x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
+ sums = _mm_add_ps(sums, x4);
+ if (!reuse_pulses) {
+ /* Clear y and ypulse in case we don't do the projection. */
+ _mm_storeu_ps(&y[i], _mm_setzero_ps());
+ _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
+ }
+ _mm_storeu_ps(&x[i], x4);
+ }
+ sums = horizontal_sum_ps(sums);
+ int pulses_left = k;
+ {
+ __m128i pulses_sum;
+ __m128 yy4, xy4;
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ if (reuse_pulses) {
+ /* We reuse pulses from a previous search so we don't have to search them
+ again. */
+ for (j = 0; j < n; j += 4) {
+ __m128 x4, y4;
+ __m128i iy4;
+ iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ x4 = _mm_loadu_ps(&x[j]);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+ /* Double the y[] vector so we don't have to do it in the search loop.
+ */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
+ xy4 = horizontal_sum_ps(xy4);
+ xy = _mm_cvtss_f32(xy4);
+ yy4 = horizontal_sum_ps(yy4);
+ yy = _mm_cvtss_f32(yy4);
+ } else if (k > (n >> 1)) {
+ /* Do a pre-search by projecting on the pyramid. */
+ __m128 rcp4;
+ float sum = _mm_cvtss_f32(sums);
+    /* If x is too small, just replace it with a pulse at 0. This prevents
+       infinities and NaNs from causing too many pulses to be allocated. Here,
+       64 is an approximation of infinity. */
+ if (sum <= EPSILON) {
+ x[0] = 1.f;
+ for (i = 1; i < n; i++) {
+ x[i] = 0;
+ }
+ sums = _mm_set_ps1(1.f);
+ }
+ /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
+ rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ for (j = 0; j < n; j += 4) {
+ __m128 rx4, x4, y4;
+ __m128i iy4;
+ x4 = _mm_loadu_ps(&x[j]);
+ rx4 = _mm_mul_ps(x4, rcp4);
+ iy4 = _mm_cvttps_epi32(rx4);
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+ /* Double the y[] vector so we don't have to do it in the search loop.
+ */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
+ xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
+ yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
+ }
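+    /* The three padding entries are set so that they can never win the
+       argmax in the pulse-placement loop below. */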
+ x[n] = x[n + 1] = x[n + 2] = -100;
+ y[n] = y[n + 1] = y[n + 2] = 100;
+ }
+
+ /* This should never happen. */
+ OD_ASSERT(pulses_left <= n + 3);
+
+ float lambda_delta_rate[MAXN + 3];
+ if (pulses_left) {
+ /* Hoist lambda to avoid the multiply in the loop. */
+ float lambda =
+ 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
+ float delta_rate = 3.f / n;
+ __m128 count = _mm_set_ps(3, 2, 1, 0);
+ for (i = 0; i < n; i += 4) {
+ _mm_storeu_ps(&lambda_delta_rate[i],
+ _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
+ count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
+ }
+ }
+ lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
+ 1e30f;
+
+ for (i = 0; i < pulses_left; i++) {
+ int best_id = 0;
+ __m128 xy4, yy4;
+ __m128 max, max2;
+ __m128i count;
+ __m128i pos;
+
+ /* The squared magnitude term gets added anyway, so we might as well
+ add it outside the loop. */
+ yy = yy + 1;
+ xy4 = _mm_load1_ps(&xy);
+ yy4 = _mm_load1_ps(&yy);
+ max = _mm_setzero_ps();
+ pos = _mm_setzero_si128();
+ count = _mm_set_epi32(3, 2, 1, 0);
+ for (j = 0; j < n; j += 4) {
+ __m128 x4, y4, r4;
+ x4 = _mm_loadu_ps(&x[j]);
+ y4 = _mm_loadu_ps(&y[j]);
+ x4 = _mm_add_ps(x4, xy4);
+ y4 = _mm_add_ps(y4, yy4);
+ y4 = _mm_rsqrt_ps(y4);
+ r4 = _mm_mul_ps(x4, y4);
+ /* Subtract lambda. */
+ r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
+ /* Update the index of the max. */
+ pos = _mm_max_epi16(
+ pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
+ /* Update the max. */
+ max = _mm_max_ps(max, r4);
+ /* Update the indices (+4) */
+ count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
+ }
+ /* Horizontal max. */
+ max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max2 =
+ _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
+    /* Now that max2 contains the max at all positions, look at which value(s)
+       of the partial max are equal to the global max. */
+ pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
+ pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
+ pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
+ best_id = _mm_cvtsi128_si32(pos);
+ OD_ASSERT(best_id < n);
+ /* Updating the sums of the new pulse(s) */
+ xy = xy + x[best_id];
+ /* We're multiplying y[j] by two so we don't have to do it here. */
+ yy = yy + y[best_id];
+ /* Only now that we've made the final choice, update y/ypulse. */
+ /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
+ y[best_id] += 2;
+ ypulse[best_id]++;
+ }
+
+ /* Put the original sign back. */
+ for (i = 0; i < n; i += 4) {
+ __m128i y4;
+ __m128i s4;
+ y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
+ s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
+ y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+ _mm_storeu_si128((__m128i *)&ypulse[i], y4);
+ }
+ return xy * rsqrtf(xx * yy + FLT_MIN);
+}
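+
+/* For reference, a minimal scalar sketch of the same greedy pulse search.
+   It omits sign handling and the RDO lambda term, so it is not equivalent to
+   the function above; it assumes non-negative inputs and that sqrt() is
+   reachable through the includes above (as it already is for sqrtf). */
+static INLINE double pvq_search_sketch(const float *x, int n, int k,
+                                       int *ypulse) {
+  double xx = 0, xy = 0, yy = 0;
+  int i;
+  for (i = 0; i < n; i++) {
+    xx += (double)x[i] * x[i];
+    ypulse[i] = 0;
+  }
+  /* Place one pulse at a time where it most improves the cosine similarity
+     (x . y) / sqrt((x . x)(y . y)); only xy and yy change when a pulse is
+     added at position i. */
+  while (k-- > 0) {
+    int best = 0;
+    double best_gain = -1;
+    for (i = 0; i < n; i++) {
+      const double gain = (xy + x[i]) / sqrt(yy + 2 * ypulse[i] + 1);
+      if (gain > best_gain) {
+        best_gain = gain;
+        best = i;
+      }
+    }
+    xy += x[best];
+    yy += 2 * ypulse[best] + 1;
+    ypulse[best]++;
+  }
+  return xy / sqrt(xx * yy + FLT_MIN);
+}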
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h
new file mode 100644
index 000000000..3c4ce8543
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
+#define AOM_COMMON_PVQ_X86_SSE4_H_
+#endif // AOM_COMMON_PVQ_X86_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 000000000..260faa8c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,1805 @@
+#include <smmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/restoration.h"
+
+/* Calculate four consecutive entries of the intermediate A and B arrays
+ (corresponding to the first loop in the C version of
+ av1_selfguided_restoration)
+*/
+static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
+ __m128i one_over_n, __m128i s, int bit_depth, int idx,
+ int32_t *A, int32_t *B) {
+ __m128i a, b, p;
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth > 8) {
+ __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
+ __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
+ __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
+ __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
+ a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
+ b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
+ a = _mm_mullo_epi32(a, n);
+ b = _mm_mullo_epi32(b, b);
+ p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
+ } else {
+#endif
+ (void)bit_depth;
+ a = _mm_mullo_epi32(sum_sq, n);
+ b = _mm_mullo_epi32(sum, sum);
+ p = _mm_sub_epi32(a, b);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
+ __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z),
+ SGRPROJ_MTABLE_BITS);
+ z = _mm_min_epi32(z, _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ _mm_storeu_si128((__m128i *)&A[idx], a_res);
+
+ __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1);
+ __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+ __m128i b_int =
+ _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n));
+ __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS);
+
+ _mm_storeu_si128((__m128i *)&B[idx], b_res);
+}
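+
+/* A scalar sketch of what calc_block computes for one entry (8-bit path).
+   The tables and SGRPROJ_* constants are the ones from restoration.h used
+   above; 64-bit intermediates are used here only for clarity, whereas the
+   vector code keeps everything in 32 bits. */
+static INLINE void calc_entry_sketch(int32_t sum, int32_t sum_sq, int32_t n,
+                                     int32_t one_over_n, int32_t s,
+                                     int32_t *a_out, int32_t *b_out) {
+  // n times the variance of the window (non-negative, as n * sum_sq >= sum^2)
+  const int32_t p = sum_sq * n - sum * sum;
+  int z = (int)(((int64_t)p * s + (1 << (SGRPROJ_MTABLE_BITS - 1))) >>
+                SGRPROJ_MTABLE_BITS);
+  if (z > 255) z = 255;
+  *a_out = x_by_xplus1[z];  // roughly SGRPROJ_SGR * z / (z + 1)
+  *b_out = (int32_t)((((int64_t)(SGRPROJ_SGR - *a_out) * sum * one_over_n) +
+                      (1 << (SGRPROJ_RECIP_BITS - 1))) >>
+                     SGRPROJ_RECIP_BITS);
+}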
+
+static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum
+ // When the width is not a multiple of 4, we know that 'stride' is rounded up
+ // to a multiple of 4. So it is safe for this loop to calculate extra columns
+ // at the right-hand edge of the frame.
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_madd_epi16(tmp, tmp);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 1; i < height - 2; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ y = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+ }
+}
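+
+// A scalar sketch of the column sums computed above for radius 1 (one column;
+// the vector code does four per iteration). The radius-2 and radius-3 versions
+// below use the same sliding window, just with longer start-up and wind-down
+// ramps, and A[] gets the same treatment with squared pixels.
+static INLINE void boxsum_col_sketch_r1(const uint8_t *src, int height,
+                                        int src_stride, int32_t *B,
+                                        int buf_stride) {
+  int i, k;
+  for (i = 0; i < height; ++i) {
+    const int top = (i - 1 < 0) ? 0 : i - 1;
+    const int bot = (i + 1 > height - 1) ? height - 1 : i + 1;
+    int32_t sum = 0;
+    for (k = top; k <= bot; ++k) sum += src[k * src_stride];
+    B[i * buf_stride] = sum;
+  }
+}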
+
+static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+
+ // Horizontal sum
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
+ int h = AOMMIN(2, height - i) + AOMMIN(1, i);
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ // Note: The _mm_slli_si128 call sets up a register containing
+ // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]},
+ // so that the first element of 'sum' (which should only add two values
+ // together) ends up calculated correctly.
+ __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4),
+ _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)));
+ __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1],
+ one_by_x[3 * h - 1], one_by_x[2 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
+
+ n = _mm_set1_epi32(3 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]);
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 3
+ a2 = _mm_alignr_epi8(a2, a1, 12);
+ b2 = _mm_alignr_epi8(b2, b1, 12);
+
+ // Note: When the width is not a multiple of 4, this loop may end up
+ // writing to the last 4 columns of the frame, potentially with incorrect
+ // values (especially for r=2 and r=3).
+ // This is fine, since we fix up those values in the block after this
+ // loop, and in exchange we never have more than four values to
+ // write / fix up after this loop finishes.
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ b1 = b2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3]
+ a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7]
+ and similar for b1,b2 and B
+ */
+ sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)));
+ sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)));
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xfc);
+ b2 = _mm_blend_epi16(b2, zero, 0xfc);
+
+ sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)));
+ sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)));
+ n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h);
+ one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1],
+ one_by_x[3 * h - 1], one_by_x[3 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
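+
+// A scalar sketch of the radius-1 horizontal pass above: each entry becomes
+// the sum of its neighbouring column sums, and n is the number of source
+// pixels in the full 2-D window (h rows times the columns that fall inside
+// the frame), which selects the one_by_x / sgrproj_mtable entries handed to
+// calc_block.
+static INLINE void boxsum_row_sketch_r1(const int32_t *col_sums, int width,
+                                        int h, int32_t *out, int32_t *n_out) {
+  int j, jj;
+  for (j = 0; j < width; ++j) {
+    const int left = (j - 1 < 0) ? 0 : j - 1;
+    const int right = (j + 1 > width - 1) ? width - 1 : j + 1;
+    int32_t sum = 0;
+    for (jj = left; jj <= right; ++jj) sum += col_sums[jj];
+    out[j] = sum;
+    n_out[j] = h * (right - left + 1);
+  }
+}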
+
+static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, c2, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
+    // Important: c holds pixel values up to 255, so c*c can be up to 65025,
+    // which does not fit in a signed 16-bit value. Hence we must zero-extend,
+    // not sign-extend, when widening to 32 bits.
+ c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 2; i < height - 3; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j])));
+ y = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j])));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+ }
+}
+
+static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+
+ // Horizontal sum
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
+ int h = AOMMIN(3, height - i) + AOMMIN(2, i);
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ __m128i sum_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)),
+ _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))),
+ _mm_alignr_epi8(b2, b1, 8));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)),
+ _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))),
+ _mm_alignr_epi8(a2, a1, 8));
+
+ __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1],
+ one_by_x[4 * h - 1], one_by_x[3 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
+ sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 2
+ a2 = _mm_alignr_epi8(a2, a1, 8);
+ b2 = _mm_alignr_epi8(b2, b1, 8);
+
+ n = _mm_set1_epi32(5 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]);
+
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+ b1 = b2;
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2]
+ a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6]
+ and similar for b1,b2 and B
+ */
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
+
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ // If the width is not a multiple of 4, we need to reset j to width - 4
+ // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xf0);
+ b2 = _mm_blend_epi16(b2, zero, 0xf0);
+
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
+
+ n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h);
+ one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1],
+ one_by_x[5 * h - 1], one_by_x[5 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1],
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
+
+static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum over 7-pixel regions, 4 columns at a time
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, d, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp, tmp2;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(
+ _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
+ tmp = _mm_unpacklo_epi16(a, b);
+ tmp2 = _mm_unpacklo_epi16(c, d);
+ sum_sq =
+ _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 3; i < height - 4; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j])));
+ y = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j])));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
+ }
+}
+
+static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+ // Horizontal sum over 7-pixel regions of dst
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
+ int h = AOMMIN(4, height - i) + AOMMIN(3, i);
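+    // h is the number of rows that contributed to the vertical sums for this
+    // row: 7 in the interior, fewer within 3 rows of the top or bottom edge.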
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ __m128i sum_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)),
+ _mm_add_epi32(_mm_slli_si128(b1, 4), b1)),
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)),
+ _mm_alignr_epi8(b2, b1, 12)));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)),
+ _mm_add_epi32(_mm_slli_si128(a1, 4), a1)),
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)),
+ _mm_alignr_epi8(a2, a1, 12)));
+
+ __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1],
+ one_by_x[5 * h - 1], one_by_x[4 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1],
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
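+    // This first calc_block call covers columns 0..3, whose 7-tap horizontal
+    // windows are clipped at the left edge to widths 4, 5, 6 and 7, hence the
+    // per-lane pixel counts n = 4*h .. 7*h above.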
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 1
+ a2 = _mm_alignr_epi8(a2, a1, 4);
+ b2 = _mm_alignr_epi8(b2, b1, 4);
+
+ n = _mm_set1_epi32(7 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]);
+
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+ b1 = b2;
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1]
+ a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5]
+ a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9]
+ and similar for b1,b2,b3 and B
+ */
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
+ _mm_alignr_epi8(b2, b1, 12))),
+ _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)),
+ _mm_alignr_epi8(b3, b2, 8)));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
+ _mm_alignr_epi8(a2, a1, 12))),
+ _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)),
+ _mm_alignr_epi8(a3, a2, 8)));
+
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xc0);
+ b2 = _mm_blend_epi16(b2, zero, 0xc0);
+
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
+ _mm_alignr_epi8(b2, b1, 12))),
+ _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)),
+ _mm_alignr_epi8(zero, b2, 8)));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
+ _mm_alignr_epi8(a2, a1, 12))),
+ _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)),
+ _mm_alignr_epi8(zero, a2, 8)));
+
+ n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h);
+ one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1],
+ one_by_x[6 * h - 1], one_by_x[7 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
+ sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
+
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int r, int eps, int32_t *tmpbuf) {
+ int32_t *A = tmpbuf;
+ int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int i, j;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width + 3) & ~3) + 16;
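+  // For example, width == 64 gives buf_stride == 80 int32 entries (320
+  // bytes), keeping every row of A and B 16-byte aligned for the
+  // _mm_store_si128 calls in the vertical pass (assuming an aligned tmpbuf).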
+
+ // Don't filter tiles with dimensions < 5 on any axis
+ if ((width < 5) || (height < 5)) return;
+
+ if (r == 1) {
+ selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
+ } else if (r == 2) {
+ selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
+ } else if (r == 3) {
+ selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
+ } else {
+ assert(0);
+ }
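+
+  // After these two passes, A holds the per-pixel coefficients a and B the
+  // offsets b produced by calc_block (defined earlier in this file). The code
+  // below combines them over a 3x3 neighbourhood: in the interior the weights
+  // are 4 (centre and edge-adjacent) and 3 (diagonals), summing to 32
+  // (nb == 5); the reduced border stencils sum to 8 (nb == 3).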
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
+ A[k + buf_stride - 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
+ B[k + buf_stride - 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k + 1] + A[k - buf_stride + 1] +
+ A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k + 1] + B[k - buf_stride + 1] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ // Vectorize the innermost loop
+ for (j = 1; j < width - 1; j += 4) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+
+ __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
+ __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
+ __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
+ __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
+ __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
+ __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
+
+ __m128i a0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
+ _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 4))),
+ _mm_alignr_epi8(tmp1, tmp0, 4));
+ __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
+ _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 8)));
+ __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
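+      // a0 gathered the centre and its four edge-adjacent neighbours and a1
+      // the four diagonals, so this computes a = 4 * a0 + 3 * a1 (as
+      // ((a0 + a1) << 2) - a1), matching the scalar weights in the edge loops.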
+
+ __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
+ __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
+ __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
+ __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
+ __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
+ __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
+
+ __m128i b0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
+ _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 4))),
+ _mm_alignr_epi8(tmp7, tmp6, 4));
+ __m128i b1 =
+ _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
+ _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 8)));
+ __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
+
+ __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
+
+ __m128i rounding = _mm_set1_epi32(
+ (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
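+      // Adding half of the divisor before the arithmetic shift reproduces
+      // ROUND_POWER_OF_TWO from the scalar paths.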
+ __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ _mm_storeu_si128((__m128i *)&dst[m], w);
+ }
+
+    // Deal with any extra pixels at the right-hand edge of the frame
+    // (typically there are 2 such pixels, but there may be anywhere between
+    // 0 and 3).
+ for (; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k - 1] + A[k - buf_stride - 1] +
+ A[k + buf_stride - 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k - 1] + B[k - buf_stride - 1] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
+ A[k - buf_stride - 1] + A[k - buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
+ B[k - buf_stride - 1] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride,
+ int32_t *dst, int dst_stride, int corner,
+ int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
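+  // By construction, center + 4 * edge + 4 * corner == (1 << SGRPROJ_RST_BITS),
+  // i.e. the 3x3 kernel has unit DC gain in SGRPROJ_RST_BITS fixed point.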
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ }
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ }
+ __m128i center_ = _mm_set1_epi16(center);
+ __m128i edge_ = _mm_set1_epi16(edge);
+ __m128i corner_ = _mm_set1_epi16(corner);
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+    // Process 8 pixels at a time.
+ for (j = 1; j < width - 8; j += 8) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu8_epi16(a);
+ __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
+ __m128i cl = _mm_cvtepu8_epi16(b);
+ __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8));
+ __m128i bl = _mm_cvtepu8_epi16(c);
+ __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8));
+
+ __m128i x = _mm_alignr_epi8(cr, cl, 2);
+ __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl),
+ _mm_add_epi16(_mm_alignr_epi8(br, bl, 2),
+ _mm_alignr_epi8(cr, cl, 4)));
+ __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl),
+ _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4),
+ _mm_alignr_epi8(br, bl, 4)));
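+      // x holds the 8 centre pixels, y the sums of their four edge-adjacent
+      // neighbours and z the sums of their four diagonal neighbours, formed
+      // by byte-shifting the widened rows loaded at column j - 1.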
+
+ __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
+ _mm_add_epi16(_mm_mullo_epi16(y, edge_),
+ _mm_mullo_epi16(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
+ _mm_storeu_si128((__m128i *)&dst[l + 4],
+ _mm_cvtepi16_epi32(_mm_srli_si128(res, 8)));
+ }
+ // If there are enough pixels left in this row, do another batch of 4
+ // pixels.
+ for (; j < width - 4; j += 4) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu8_epi16(a);
+ __m128i cl = _mm_cvtepu8_epi16(b);
+ __m128i bl = _mm_cvtepu8_epi16(c);
+
+ __m128i x = _mm_srli_si128(cl, 2);
+ __m128i y = _mm_add_epi16(
+ _mm_add_epi16(_mm_srli_si128(tl, 2), cl),
+ _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4)));
+ __m128i z = _mm_add_epi16(
+ _mm_add_epi16(tl, bl),
+ _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4)));
+
+ __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
+ _mm_add_epi16(_mm_mullo_epi16(y, edge_),
+ _mm_mullo_epi16(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
+ }
+ // Handle any leftover pixels
+ for (; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
+ int stride, int eps, int *xqd,
+ uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner, sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].r1, sgr_params[eps].e1,
+ tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
+ sgr_params[eps].r2, sgr_params[eps].e2,
+ tmpbuf2);
+ decode_xq(xqd, xq);
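+  // xq[0] and xq[1] are the projection weights decoded from xqd. With
+  // u = dat << SGRPROJ_RST_BITS, each output pixel is
+  //   ROUND(xq[0] * (flt1 - u) + xq[1] * (flt2 - u) + (u << SGRPROJ_PRJ_BITS),
+  //         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS),
+  // exactly as spelled out in the scalar leftover loop below.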
+
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+ for (j = 0; j < width; j += 8) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ __m128i src =
+ _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])),
+ SGRPROJ_RST_BITS);
+
+ const __m128i u_0 = _mm_cvtepu16_epi32(src);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
+
+ const __m128i f1_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
+ const __m128i f2_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
+ const __m128i f1_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
+ const __m128i f2_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
+
+ const __m128i v_0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
+ _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
+ const __m128i v_1 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
+ _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+ const __m128i rounding =
+ _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+ const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
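+      // _mm_packs_epi32 followed by _mm_packus_epi16 saturates the results to
+      // [0, 255], so the scalar path's clip_pixel() is not needed here; the
+      // low 8 bytes carry the 8 output pixels.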
+ _mm_storel_epi64((__m128i *)&dst[m], res);
+ }
+ // Process leftover pixels
+ for (; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      dst[m] = clip_pixel(w);
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// Only the vertical sums need to be adjusted for highbitdepth
+
+static void highbd_selfguided_restoration_1_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_madd_epi16(tmp, tmp);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 1; i < height - 2; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+ }
+}
+
+static void highbd_selfguided_restoration_2_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, c2, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+ c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
+ // Important: We need to widen *before* squaring here, since
+ // c^2 may be up to 2^24.
+ c = _mm_cvtepu16_epi32(c);
+ c2 = _mm_mullo_epi32(c, c);
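+    // For example, a 12-bit sample of 4095 squares to roughly 2^24, which
+    // would not fit in a 16-bit lane, hence the 32-bit multiply here.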
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 2; i < height - 3; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+ }
+}
+
+static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, d, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp, tmp2;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+ c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
+ d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(
+ _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
+ tmp = _mm_unpacklo_epi16(a, b);
+ tmp2 = _mm_unpacklo_epi16(c, d);
+ sum_sq =
+ _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 3; i < height - 4; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
+ }
+}
+
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
+ int height, int stride,
+ int32_t *dst, int dst_stride,
+ int bit_depth, int r, int eps,
+ int32_t *tmpbuf) {
+ int32_t *A = tmpbuf;
+ int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int i, j;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width + 3) & ~3) + 16;
+
+ // Don't filter tiles with dimensions < 5 on any axis
+ if ((width < 5) || (height < 5)) return;
+
+ if (r == 1) {
+ highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else if (r == 2) {
+ highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else if (r == 3) {
+ highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else {
+ assert(0);
+ }
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
+ A[k + buf_stride - 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
+ B[k + buf_stride - 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k + 1] + A[k - buf_stride + 1] +
+ A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k + 1] + B[k - buf_stride + 1] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ // Vectorize the innermost loop
+ for (j = 1; j < width - 1; j += 4) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+
+ __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
+ __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
+ __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
+ __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
+ __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
+ __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
+
+ __m128i a0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
+ _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 4))),
+ _mm_alignr_epi8(tmp1, tmp0, 4));
+ __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
+ _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 8)));
+ __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
+
+ __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
+ __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
+ __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
+ __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
+ __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
+ __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
+
+ __m128i b0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
+ _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 4))),
+ _mm_alignr_epi8(tmp7, tmp6, 4));
+ __m128i b1 =
+ _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
+ _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 8)));
+ __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
+
+ __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
+
+ __m128i rounding = _mm_set1_epi32(
+ (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
+ __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ _mm_storeu_si128((__m128i *)&dst[m], w);
+ }
+
+    // Deal with any extra pixels at the right-hand edge of the frame
+    // (typically there are 2 such pixels, but there may be anywhere between
+    // 0 and 3).
+ for (; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k - 1] + A[k - buf_stride - 1] +
+ A[k + buf_stride - 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k - 1] + B[k - buf_stride - 1] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
+ A[k - buf_stride - 1] + A[k - buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
+ B[k - buf_stride - 1] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int corner, int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ }
+ __m128i center_ = _mm_set1_epi32(center);
+ __m128i edge_ = _mm_set1_epi32(edge);
+ __m128i corner_ = _mm_set1_epi32(corner);
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ // Process 4 pixels at a time
+ for (j = 1; j < width - 4; j += 4) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu16_epi32(a);
+ __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+ __m128i cl = _mm_cvtepu16_epi32(b);
+ __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8));
+ __m128i bl = _mm_cvtepu16_epi32(c);
+ __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8));
+
+ __m128i x = _mm_alignr_epi8(cr, cl, 4);
+ __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl),
+ _mm_add_epi32(_mm_alignr_epi8(br, bl, 4),
+ _mm_alignr_epi8(cr, cl, 8)));
+ __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl),
+ _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8),
+ _mm_alignr_epi8(br, bl, 8)));
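+      // Same centre/edge/diagonal gather as the 8-bit path above, but with
+      // 32-bit lanes the _mm_alignr_epi8 offsets become 4 and 8 bytes.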
+
+ __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_),
+ _mm_add_epi32(_mm_mullo_epi32(y, edge_),
+ _mm_mullo_epi32(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], res);
+ }
+ // Handle any leftover pixels
+ for (; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ }
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_highbd_sse4_1(
+ uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
+ int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner,
+ sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
+ width, bit_depth, sgr_params[eps].r1,
+ sgr_params[eps].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
+ width, bit_depth, sgr_params[eps].r2,
+ sgr_params[eps].e2, tmpbuf2);
+ decode_xq(xqd, xq);
+
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+ for (j = 0; j < width; j += 8) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ __m128i src =
+ _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS);
+
+ const __m128i u_0 = _mm_cvtepu16_epi32(src);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
+
+ const __m128i f1_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
+ const __m128i f2_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
+ const __m128i f1_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
+ const __m128i f2_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
+
+ const __m128i v_0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
+ _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
+ const __m128i v_1 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
+ _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+ const __m128i rounding =
+ _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+ const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+ const __m128i res = _mm_min_epi16(tmp, max);
+
+ _mm_store_si128((__m128i *)&dst[m], res);
+ }
+ // Process leftover pixels
+ for (; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+ }
+ }
+}
+
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 000000000..925e4650d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+static const __m128i *const filter = (const __m128i *const)warped_filter;
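+// warped_filter holds one 8-tap filter (eight int16 coefficients, i.e. one
+// __m128i) per fractional offset, so "filter + (sx >> WARPEDDIFF_PREC_BITS)"
+// below selects the filter for a given subpel position.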
+
+/* SSE2 version of the rotzoom/affine warp filter */
+void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
+ int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but an explicit check is kept below
+     (commented out) for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // (x, y) coordinates of the center of this block in the destination
+ // image
+ int32_t dst_x = p_col + j + 4;
+ int32_t dst_y = p_row + i + 4;
+
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
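+      // x4/y4 are the warped source coordinates of the block centre in
+      // WARPEDMODEL_PREC_BITS fixed point; ix4/iy4 are their integer parts
+      // and sx4/sy4 the fractional parts used to index the filter table.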
+
+ // Horizontal filter
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else if (ix4 >= width + 6) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k +
+ // Include rounding and offset here
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
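+          // The added constants fold in rounding to the nearest filter-table
+          // entry and the WARPEDPIXEL_PREC_SHIFTS bias that keeps the table
+          // index valid when the shear terms make the offset negative.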
+
+ // Load source pixels
+ __m128i zero = _mm_setzero_si128();
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ // Filter even-index pixels
+ __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ // Calculate filtered results
+ __m128i src_0 = _mm_unpacklo_epi8(src, zero);
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Filter odd-index pixels
+ __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
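+          // With this layout, the _mm_unpacklo/hi_epi16 calls in the vertical
+          // pass pair the same column from two consecutive rows, ready for
+          // _mm_madd_epi16.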
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ __m128i *src = tmp + (k + 4);
+ __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ // Round and pack into 8 bits
+ __m128i round_const =
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
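+      // When ref_frm is nonzero the warped block is averaged into the
+      // existing contents of 'pred' (second prediction of a compound) using
+      // _mm_avg_epu8, which rounds to nearest.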
+ if (p_width == 4) {
+ if (ref_frm) {
+ const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
+ res_8bit = _mm_avg_epu8(res_8bit, orig);
+ }
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+ }
+}