summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm8
-rw-r--r--third_party/aom/aom_dsp/x86/common_avx2.h147
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h24
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h20
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c239
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm197
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c1256
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c521
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c873
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c1010
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm2
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm7
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c249
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c10
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_avx2.c413
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2.asm146
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2.c684
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_ssse3.c885
-rw-r--r--third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h22
-rw-r--r--third_party/aom/aom_dsp/x86/inv_txfm_sse2.h12
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_sse2.c144
-rw-r--r--third_party/aom/aom_dsp/x86/lpf_common_sse2.h130
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c16
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c14
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_sse4.c4
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_sse4.c16
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_sse2.c38
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_sse2.asm2
-rw-r--r--third_party/aom/aom_dsp/x86/sad_sse2.asm6
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_avx2.h130
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_intrin.h20
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c30
32 files changed, 5938 insertions, 1337 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 357f37401..8688fb544 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -346,9 +346,15 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
psraw m0, 7
psraw m4, 7
%ifidn %1, h8_add_src
+%if ARCH_X86=1 && CONFIG_PIC=1
+ pcmpeqb m2, m2 ;all ones
+ psrlw m2, 8 ;even_byte_mask
+%else
+ mova m2, [GLOBAL(even_byte_mask)]
+%endif
movu m5, [srcq]
mova m7, m5
- pand m5, [even_byte_mask]
+ pand m5, m2
psrlw m7, 8
paddsw m0, m5
paddsw m4, m7
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 000000000..5f9596a74
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_COMMON_AVX2_H
+#define AOM_DSP_X86_COMMON_AVX2_H
+
+#include <immintrin.h>
+
+#include "./aom_config.h"
+
+// Note: in and out could have the same value
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
index d3aceae00..86df4a6f6 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
@@ -15,21 +15,21 @@
#include "./aom_config.h"
static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-#if CONFIG_HIGHBITDEPTH
- const __m256i zero = _mm256_setzero_si256();
- const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
- __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
- __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+ __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+ __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
- __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
- __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+ __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+ __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
- _mm256_storeu_si256((__m256i *)out, y0);
- _mm256_storeu_si256((__m256i *)(out + 8), y1);
-#else
- _mm256_storeu_si256((__m256i *)out, *coeff);
-#endif
+ _mm256_storeu_si256((__m256i *)out, y0);
+ _mm256_storeu_si256((__m256i *)(out + 8), y1);
+ } else {
+ _mm256_storeu_si256((__m256i *)out, *coeff);
+ }
}
#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 26b2db2e0..58e8971dd 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32(
}
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_store_si128((__m128i *)(dst_ptr), out0);
- _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_HIGHBITDEPTH
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ } else {
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+ }
}
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
new file mode 100644
index 000000000..41b55c985
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// D45E_PRED
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
+ const __m256i *z) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a = _mm256_avg_epu16(*x, *z);
+ const __m256i b =
+ _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
+ return _mm256_avg_epu16(b, *y);
+}
+
+static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
+ const __m256i *a2, uint16_t **dst,
+ ptrdiff_t stride) {
+ const __m256i y = avg3_epu16(a0, a1, a2);
+ _mm256_storeu_si256((__m256i *)*dst, y);
+ *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 9);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
+ x0 = _mm256_insert_epi16(x0, above[23], 15);
+ const __m256i y = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 15);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ x2 = _mm256_insert_epi16(x2, above[31], 15);
+ const __m256i y = avg3_epu16(&x0, &x1, &x2);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 33);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+ x0 = _mm256_insert_epi16(x0, above[47], 15);
+ const __m256i y = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+ __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+ uint16_t *dst1 = dst;
+ uint16_t *dst2 = dst + 16;
+
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+ } while (i < 15);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i u = avg3_epu16(&x0, &x1, &x2);
+ _mm256_storeu_si256((__m256i *)dst1, u);
+
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
+ y2 = _mm256_insert_epi16(y2, above[47], 15);
+ u = avg3_epu16(&y0, &y1, &y2);
+ _mm256_storeu_si256((__m256i *)dst2, u);
+}
+
+void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+ __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+ uint16_t *dst1 = dst;
+ uint16_t *dst2 = dst + 16;
+
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+ } while (i < 33);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+ __m256i u = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst1, u);
+
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
+ y0 = _mm256_insert_epi16(y0, above[63], 15);
+ u = avg3_epu16(&y1, &y2, &y0);
+ _mm256_storeu_si256((__m256i *)dst2, u);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
index 5d84ef8a7..91b3d126c 100644
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm
@@ -257,200 +257,3 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
dec nlines4d
jnz .loop
REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
- movd m1, [aboveq-2]
- movq m0, [aboveq]
- pshuflw m1, m1, 0x0
- movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4
- movlhps m1, m1 ; tl tl tl tl tl tl tl tl
- ; Get the values to compute the maximum value at this bit depth
- pcmpeqw m3, m3
- movd m4, bpsd
- psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl
- psllw m3, m4
- pcmpeqw m2, m2
- pxor m4, m4 ; min possible value
- pxor m3, m2 ; max possible value
- mova m1, [leftq]
- pshuflw m2, m1, 0x0
- pshuflw m5, m1, 0x55
- movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2
- paddw m2, m0
- ;Clamp to the bit-depth
- pminsw m2, m3
- pmaxsw m2, m4
- ;Store the values
- movq [dstq ], m2
- movhpd [dstq+strideq*2], m2
- lea dstq, [dstq+strideq*4]
- pshuflw m2, m1, 0xaa
- pshuflw m5, m1, 0xff
- movlhps m2, m5
- paddw m2, m0
- ;Clamp to the bit-depth
- pminsw m2, m3
- pmaxsw m2, m4
- ;Store the values
- movq [dstq ], m2
- movhpd [dstq+strideq*2], m2
- RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
- movd m1, [aboveq-2]
- mova m0, [aboveq]
- pshuflw m1, m1, 0x0
- ; Get the values to compute the maximum value at this bit depth
- mov oned, 1
- pxor m3, m3
- pxor m4, m4
- pinsrw m3, oned, 0
- pinsrw m4, bpsd, 0
- pshuflw m3, m3, 0x0
- DEFINE_ARGS dst, stride, line, left
- punpcklqdq m3, m3
- mov lineq, -4
- mova m2, m3
- punpcklqdq m1, m1
- psllw m3, m4
- add leftq, 16
- psubw m3, m2 ; max possible value
- pxor m4, m4 ; min possible value
- psubw m0, m1
-.loop:
- movd m1, [leftq+lineq*4]
- movd m2, [leftq+lineq*4+2]
- pshuflw m1, m1, 0x0
- pshuflw m2, m2, 0x0
- punpcklqdq m1, m1
- punpcklqdq m2, m2
- paddw m1, m0
- paddw m2, m0
- ;Clamp to the bit-depth
- pminsw m1, m3
- pminsw m2, m3
- pmaxsw m1, m4
- pmaxsw m2, m4
- ;Store the values
- mova [dstq ], m1
- mova [dstq+strideq*2], m2
- lea dstq, [dstq+strideq*4]
- inc lineq
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
- movd m2, [aboveq-2]
- mova m0, [aboveq]
- mova m1, [aboveq+16]
- pshuflw m2, m2, 0x0
- ; Get the values to compute the maximum value at this bit depth
- pcmpeqw m3, m3
- movd m4, bpsd
- punpcklqdq m2, m2
- psllw m3, m4
- pcmpeqw m5, m5
- pxor m4, m4 ; min possible value
- pxor m3, m5 ; max possible value
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -8
- psubw m0, m2
- psubw m1, m2
-.loop:
- movd m7, [leftq]
- pshuflw m5, m7, 0x0
- pshuflw m2, m7, 0x55
- punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1
- punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2
- paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1
- paddw m5, m1 ; t5-tl+l1 to t8-tl+l1
- pminsw m6, m3
- pminsw m5, m3
- pmaxsw m6, m4 ; Clamp to the bit-depth
- pmaxsw m5, m4
- mova [dstq ], m6
- mova [dstq +16], m5
- paddw m6, m2, m0
- paddw m2, m1
- pminsw m6, m3
- pminsw m2, m3
- pmaxsw m6, m4
- pmaxsw m2, m4
- mova [dstq+strideq*2 ], m6
- mova [dstq+strideq*2+16], m2
- lea dstq, [dstq+strideq*4]
- inc lineq
- lea leftq, [leftq+4]
-
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
- movd m0, [aboveq-2]
- mova m1, [aboveq]
- mova m2, [aboveq+16]
- mova m3, [aboveq+32]
- mova m4, [aboveq+48]
- pshuflw m0, m0, 0x0
- ; Get the values to compute the maximum value at this bit depth
- pcmpeqw m5, m5
- movd m6, bpsd
- psllw m5, m6
- pcmpeqw m7, m7
- pxor m6, m6 ; min possible value
- pxor m5, m7 ; max possible value
- punpcklqdq m0, m0
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -16
- psubw m1, m0
- psubw m2, m0
- psubw m3, m0
- psubw m4, m0
-.loop:
- movd m7, [leftq]
- pshuflw m7, m7, 0x0
- punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1
- paddw m0, m7, m1
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq ], m0
- paddw m0, m7, m2
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq +16], m0
- paddw m0, m7, m3
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq +32], m0
- paddw m0, m7, m4
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq +48], m0
- movd m7, [leftq+2]
- pshuflw m7, m7, 0x0
- punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2
- paddw m0, m7, m1
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq+strideq*2 ], m0
- paddw m0, m7, m2
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq+strideq*2+16], m0
- paddw m0, m7, m3
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq+strideq*2+32], m0
- paddw m0, m7, m4
- pminsw m0, m5
- pmaxsw m0, m6
- mova [dstq+strideq*2+48], m0
- lea dstq, [dstq+strideq*4]
- lea leftq, [leftq+4]
- inc lineq
- jnz .loop
- REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
new file mode 100644
index 000000000..691e166cf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -0,0 +1,1256 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+// Horizontal prediction for a 4x4 block: row r of dst is left[r] repeated
+// four times. `above` and `bd` are not used.
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  // The low 64 bits carry left[0..3]; each shuffle broadcasts one of those
+  // four values across the low four 16-bit lanes.
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+  __m128i rows[4];
+  int r;
+  (void)above;
+  (void)bd;
+  rows[0] = _mm_shufflelo_epi16(l, 0x0);
+  rows[1] = _mm_shufflelo_epi16(l, 0x55);
+  rows[2] = _mm_shufflelo_epi16(l, 0xaa);
+  rows[3] = _mm_shufflelo_epi16(l, 0xff);
+  for (r = 0; r < 4; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// Horizontal prediction for a 4-wide, 8-row block, built from two stacked
+// 4x4 predictions; the second one uses left[4..7].
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  int half;
+  for (half = 0; half < 2; ++half) {
+    aom_highbd_h_predictor_4x4_sse2(dst + half * 4 * stride, stride, above,
+                                    left + half * 4, bd);
+  }
+}
+
+// Horizontal prediction for an 8-wide, 4-row block: row r of dst is left[r]
+// repeated eight times. `above` and `bd` are not used. dst rows are written
+// with aligned 16-byte stores.
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  // Only left[0..3] is consumed (all shuffles below act on the low four
+  // lanes), so load just those 8 bytes. A full 16-byte aligned load would
+  // read four entries past the block's left column and would additionally
+  // require `left` to be 16-byte aligned.
+  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+  (void)above;
+  (void)bd;
+  // Duplicate the low 64 bits into the high half to cover all 8 pixels.
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+}
+
+// Horizontal prediction for an 8x8 block: row r of dst is left[r] repeated
+// eight times. `above` and `bd` are not used. Uses aligned 16-byte loads and
+// stores on `left` and on each dst row.
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  // Broadcast within the low half (rows 0-3) and within the high half
+  // (rows 4-7) of the register.
+  const __m128i lo0 = _mm_shufflelo_epi16(l, 0x0);
+  const __m128i lo1 = _mm_shufflelo_epi16(l, 0x55);
+  const __m128i lo2 = _mm_shufflelo_epi16(l, 0xaa);
+  const __m128i lo3 = _mm_shufflelo_epi16(l, 0xff);
+  const __m128i hi0 = _mm_shufflehi_epi16(l, 0x0);
+  const __m128i hi1 = _mm_shufflehi_epi16(l, 0x55);
+  const __m128i hi2 = _mm_shufflehi_epi16(l, 0xaa);
+  const __m128i hi3 = _mm_shufflehi_epi16(l, 0xff);
+  __m128i rows[8];
+  int r;
+  (void)above;
+  (void)bd;
+  // Replicate the broadcast half into both 64-bit halves.
+  rows[0] = _mm_unpacklo_epi64(lo0, lo0);
+  rows[1] = _mm_unpacklo_epi64(lo1, lo1);
+  rows[2] = _mm_unpacklo_epi64(lo2, lo2);
+  rows[3] = _mm_unpacklo_epi64(lo3, lo3);
+  rows[4] = _mm_unpackhi_epi64(hi0, hi0);
+  rows[5] = _mm_unpackhi_epi64(hi1, hi1);
+  rows[6] = _mm_unpackhi_epi64(hi2, hi2);
+  rows[7] = _mm_unpackhi_epi64(hi3, hi3);
+  for (r = 0; r < 8; ++r) {
+    _mm_store_si128((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// Horizontal prediction for an 8-wide, 16-row block, built from two stacked
+// 8x8 predictions; the second one uses left[8..15].
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int half;
+  for (half = 0; half < 2; ++half) {
+    aom_highbd_h_predictor_8x8_sse2(dst + half * 8 * stride, stride, above,
+                                    left + half * 8, bd);
+  }
+}
+
+// Writes one 16-pixel row made of the low 64 bits of *row repeated four
+// times, then advances *dst by one row (stride elements).
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, fill);
+  _mm_store_si128((__m128i *)(out + 8), fill);
+  *dst = out + stride;
+}
+
+// Writes one 16-pixel row made of the high 64 bits of *row repeated four
+// times, then advances *dst by one row (stride elements).
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, fill);
+  _mm_store_si128((__m128i *)(out + 8), fill);
+  *dst = out + stride;
+}
+
+// Fills a 16-wide, 8-row strip: row r is left[r] repeated across the row.
+// `left` is read with one aligned 16-byte load. The store helpers advance
+// the local dst pointer by one row per call.
+static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *left) {
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  // Rows 0-3: broadcast within the low four lanes.
+  const __m128i lo0 = _mm_shufflelo_epi16(l, 0x0);
+  const __m128i lo1 = _mm_shufflelo_epi16(l, 0x55);
+  const __m128i lo2 = _mm_shufflelo_epi16(l, 0xaa);
+  const __m128i lo3 = _mm_shufflelo_epi16(l, 0xff);
+  // Rows 4-7: broadcast within the high four lanes.
+  const __m128i hi0 = _mm_shufflehi_epi16(l, 0x0);
+  const __m128i hi1 = _mm_shufflehi_epi16(l, 0x55);
+  const __m128i hi2 = _mm_shufflehi_epi16(l, 0xaa);
+  const __m128i hi3 = _mm_shufflehi_epi16(l, 0xff);
+
+  h_store_16_unpacklo(&dst, stride, &lo0);
+  h_store_16_unpacklo(&dst, stride, &lo1);
+  h_store_16_unpacklo(&dst, stride, &lo2);
+  h_store_16_unpacklo(&dst, stride, &lo3);
+  h_store_16_unpackhi(&dst, stride, &hi0);
+  h_store_16_unpackhi(&dst, stride, &hi1);
+  h_store_16_unpackhi(&dst, stride, &hi2);
+  h_store_16_unpackhi(&dst, stride, &hi3);
+}
+
+// Horizontal prediction for a 16-wide, 8-row block; a thin wrapper around
+// h_predictor_16x8(). `above` and `bd` are not used.
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  h_predictor_16x8(dst, stride, left);
+  (void)bd;
+  (void)above;
+}
+
+// Horizontal prediction for a 16x16 block, produced as two 16x8 strips.
+// `above` and `bd` are not used.
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16_t *const left_end = left + 16;
+  (void)above;
+  (void)bd;
+
+  // Each strip consumes 8 left pixels and produces 8 rows.
+  for (; left < left_end; left += 8, dst += 8 * stride) {
+    h_predictor_16x8(dst, stride, left);
+  }
+}
+
+// Horizontal prediction for a 16-wide, 32-row block, produced as four 16x8
+// strips. `above` and `bd` are not used.
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16_t *const left_end = left + 32;
+  (void)above;
+  (void)bd;
+
+  // Each strip consumes 8 left pixels and produces 8 rows.
+  for (; left < left_end; left += 8, dst += 8 * stride) {
+    h_predictor_16x8(dst, stride, left);
+  }
+}
+
+// Writes one 32-pixel row made of the low 64 bits of *row repeated eight
+// times, then advances *dst by one row (stride elements).
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
+  int col;
+  for (col = 0; col < 32; col += 8) {
+    _mm_store_si128((__m128i *)(out + col), fill);
+  }
+  *dst = out + stride;
+}
+
+// Writes one 32-pixel row made of the high 64 bits of *row repeated eight
+// times, then advances *dst by one row (stride elements).
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
+  int col;
+  for (col = 0; col < 32; col += 8) {
+    _mm_store_si128((__m128i *)(out + col), fill);
+  }
+  *dst = out + stride;
+}
+
+// Fills a 32-wide, 8-row strip: row r is left[r] repeated across the row.
+// `left` is read with one aligned 16-byte load. The store helpers advance
+// the local dst pointer by one row per call.
+static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *left) {
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  // Rows 0-3: broadcast within the low four lanes.
+  const __m128i lo0 = _mm_shufflelo_epi16(l, 0x0);
+  const __m128i lo1 = _mm_shufflelo_epi16(l, 0x55);
+  const __m128i lo2 = _mm_shufflelo_epi16(l, 0xaa);
+  const __m128i lo3 = _mm_shufflelo_epi16(l, 0xff);
+  // Rows 4-7: broadcast within the high four lanes.
+  const __m128i hi0 = _mm_shufflehi_epi16(l, 0x0);
+  const __m128i hi1 = _mm_shufflehi_epi16(l, 0x55);
+  const __m128i hi2 = _mm_shufflehi_epi16(l, 0xaa);
+  const __m128i hi3 = _mm_shufflehi_epi16(l, 0xff);
+
+  h_store_32_unpacklo(&dst, stride, &lo0);
+  h_store_32_unpacklo(&dst, stride, &lo1);
+  h_store_32_unpacklo(&dst, stride, &lo2);
+  h_store_32_unpacklo(&dst, stride, &lo3);
+  h_store_32_unpackhi(&dst, stride, &hi0);
+  h_store_32_unpackhi(&dst, stride, &hi1);
+  h_store_32_unpackhi(&dst, stride, &hi2);
+  h_store_32_unpackhi(&dst, stride, &hi3);
+}
+
+// Horizontal prediction for a 32-wide, 16-row block, produced as two 32x8
+// strips. `above` and `bd` are not used.
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16_t *const left_end = left + 16;
+  (void)above;
+  (void)bd;
+
+  // Each strip consumes 8 left pixels and produces 8 rows.
+  for (; left < left_end; left += 8, dst += 8 * stride) {
+    h_predictor_32x8(dst, stride, left);
+  }
+}
+
+// Horizontal prediction for a 32x32 block, produced as four 32x8 strips.
+// `above` and `bd` are not used.
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16_t *const left_end = left + 32;
+  (void)above;
+  (void)bd;
+
+  // Each strip consumes 8 left pixels and produces 8 rows.
+  for (; left < left_end; left += 8, dst += 8 * stride) {
+    h_predictor_32x8(dst, stride, left);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP, DC_LEFT, DC_128
+
+// 4x4
+
+// Sums ref[0..3] with 16-bit arithmetic. On return, 16-bit lanes 0 and 1
+// both hold ref[0] + ref[1] + ref[2] + ref[3]; other lanes are by-products.
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+  const __m128i abcd = _mm_loadl_epi64((const __m128i *)ref);  // a b c d
+  const __m128i cdaa = _mm_shufflelo_epi16(abcd, 0xe);         // c d a a
+  const __m128i pairs = _mm_add_epi16(abcd, cdaa);             // a+c b+d ..
+  const __m128i swapped = _mm_shufflelo_epi16(pairs, 0x1);     // b+d a+c ..
+  return _mm_add_epi16(pairs, swapped);
+}
+
+// Fills a 4x4 block with the value held in 16-bit lane 0 of *dc.
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
+  int row;
+  for (row = 0; row < 4; ++row) {
+    _mm_storel_epi64((__m128i *)(dst + row * stride), fill);
+  }
+}
+
+// DC_LEFT for a 4x4 block: fills dst with the rounded average of left[0..3],
+// i.e. (sum + 2) >> 2. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_4(left);
+  // The rounding constant sits in lane 0 only, which is the lane the store
+  // helper broadcasts.
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(2));
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)above;
+  (void)bd;
+  dc_store_4x4(dst, stride, &avg);
+}
+
+// DC_TOP for a 4x4 block: fills dst with the rounded average of above[0..3],
+// i.e. (sum + 2) >> 2. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_4(above);
+  // The rounding constant sits in lane 0 only, which is the lane the store
+  // helper broadcasts.
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(2));
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)left;
+  (void)bd;
+  dc_store_4x4(dst, stride, &avg);
+}
+
+// DC_128 for a 4x4 block: fills dst with 1 << (bd - 1). `above` and `left`
+// are not used.
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // dc_store_4x4() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_4x4(dst, stride, &mid);
+}
+
+// -----------------------------------------------------------------------------
+// 4x8
+
+// Fills a 4-wide, 8-row block with the value held in 16-bit lane 0 of *dc.
+static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
+  int row;
+  for (row = 0; row < 8; ++row) {
+    _mm_storel_epi64((__m128i *)(dst + row * stride), fill);
+  }
+}
+
+// Shared with DC 8xh.
+// Sums ref[0..7] with 16-bit arithmetic; ref is read with one aligned
+// 16-byte load. On return, 16-bit lanes 0 and 1 both hold the total.
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+  const __m128i all8 = _mm_load_si128((const __m128i *)ref);
+  // Fold the high four values onto the low four.
+  const __m128i folded = _mm_add_epi16(all8, _mm_srli_si128(all8, 8));
+  // Same two-step reduction as the 4-sample sum.
+  const __m128i pairs =
+      _mm_add_epi16(folded, _mm_shufflelo_epi16(folded, 0xe));
+  const __m128i swapped = _mm_shufflelo_epi16(pairs, 0x1);
+
+  return _mm_add_epi16(pairs, swapped);
+}
+
+// DC_LEFT for a 4-wide, 8-row block: fills dst with the rounded average of
+// left[0..7], i.e. (sum + 4) >> 3. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_8(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(4));
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)above;
+  (void)bd;
+  dc_store_4x8(dst, stride, &avg);
+}
+
+// DC_TOP for a 4-wide, 8-row block: fills dst with the rounded average of
+// above[0..3], i.e. (sum + 2) >> 2. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_4(above);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(2));
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)left;
+  (void)bd;
+  dc_store_4x8(dst, stride, &avg);
+}
+
+// DC_128 for a 4-wide, 8-row block: fills dst with 1 << (bd - 1). `above`
+// and `left` are not used.
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // dc_store_4x8() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_4x8(dst, stride, &mid);
+}
+
+// -----------------------------------------------------------------------------
+// 8xh
+
+// Fills an 8-wide block of `height` rows with the value held in 16-bit
+// lane 0 of *dc, using one aligned 16-byte store per row.
+static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                const __m128i *dc) {
+  const __m128i lo4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(lo4, lo4);
+  int rows_left;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    _mm_store_si128((__m128i *)dst, fill);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// DC_TOP for 8-wide blocks of `height` rows: fills dst with the rounded
+// average of above[0..7], i.e. (sum + 4) >> 3.
+static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+                                        int height, const uint16_t *above) {
+  const __m128i total = dc_sum_8(above);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(4));
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  dc_store_8xh(dst, stride, height, &avg);
+}
+
+// DC_TOP for an 8-wide, 4-row block. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  dc_top_predictor_8xh(dst, stride, /*height=*/4, above);
+  (void)bd;
+  (void)left;
+}
+
+// DC_TOP for an 8x8 block. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  dc_top_predictor_8xh(dst, stride, /*height=*/8, above);
+  (void)bd;
+  (void)left;
+}
+
+// DC_TOP for an 8-wide, 16-row block. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  dc_top_predictor_8xh(dst, stride, /*height=*/16, above);
+  (void)bd;
+  (void)left;
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// DC_LEFT for an 8-wide, 4-row block: fills dst with the rounded average of
+// left[0..3], i.e. (sum + 2) >> 2. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_4(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(2));
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)above;
+  (void)bd;
+  dc_store_8xh(dst, stride, 4, &avg);
+}
+
+// DC_LEFT for an 8x8 block: fills dst with the rounded average of
+// left[0..7], i.e. (sum + 4) >> 3. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_8(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(4));
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)above;
+  (void)bd;
+  dc_store_8xh(dst, stride, 8, &avg);
+}
+
+// Shared with DC 16xh.
+// Sums ref[0..15] as two 8-sample sums added with 16-bit arithmetic.
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+  const __m128i first8 = dc_sum_8(ref);
+  const __m128i last8 = dc_sum_8(ref + 8);
+  return _mm_add_epi16(first8, last8);
+}
+
+// DC_LEFT for an 8-wide, 16-row block: fills dst with the rounded average of
+// left[0..15], i.e. (sum + 8) >> 4. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)above;
+  (void)bd;
+  dc_store_8xh(dst, stride, 16, &avg);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// DC_128 for 8-wide blocks of `height` rows: fills dst with 1 << (bd - 1).
+static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+                                        int height, int bd) {
+  // dc_store_8xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  dc_store_8xh(dst, stride, height, &mid);
+}
+
+// DC_128 for an 8-wide, 4-row block. `above` and `left` are not used.
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  dc_128_predictor_8xh(dst, stride, /*height=*/4, bd);
+  (void)left;
+  (void)above;
+}
+
+// DC_128 for an 8x8 block. `above` and `left` are not used.
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  dc_128_predictor_8xh(dst, stride, /*height=*/8, bd);
+  (void)left;
+  (void)above;
+}
+
+// DC_128 for an 8-wide, 16-row block. `above` and `left` are not used.
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  dc_128_predictor_8xh(dst, stride, /*height=*/16, bd);
+  (void)left;
+  (void)above;
+}
+
+// -----------------------------------------------------------------------------
+// 16xh
+
+// Fills a 16-wide block of `height` rows with the value held in 16-bit
+// lane 0 of *dc, using two aligned 16-byte stores per row.
+static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                 const __m128i *dc) {
+  const __m128i lo4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(lo4, lo4);
+  int rows_left;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// DC_LEFT for a 16-wide, 8-row block: fills dst with the rounded average of
+// left[0..7], i.e. (sum + 4) >> 3. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_8(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(4));
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)above;
+  (void)bd;
+  dc_store_16xh(dst, stride, 8, &avg);
+}
+
+// DC_LEFT for a 16x16 block: fills dst with the rounded average of
+// left[0..15], i.e. (sum + 8) >> 4. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)above;
+  (void)bd;
+  dc_store_16xh(dst, stride, 16, &avg);
+}
+
+// Shared with 32xh.
+// Sums ref[0..31]. The two 16-sample halves are summed in 16 bits, then
+// zero-extended to 32 bits before the final add, because the total of 32
+// twelve-bit samples no longer fits in 16 bits. The result's low 32-bit lane
+// holds the total.
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i lo16_wide = _mm_unpacklo_epi16(dc_sum_16(ref), zero);
+  const __m128i hi16_wide = _mm_unpacklo_epi16(dc_sum_16(ref + 16), zero);
+  return _mm_add_epi32(lo16_wide, hi16_wide);
+}
+
+// DC_LEFT for a 16-wide, 32-row block: fills dst with the rounded average of
+// left[0..31], i.e. (sum + 16) >> 5, computed in 32-bit lanes. `above` and
+// `bd` are not used.
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_32(left);
+  const __m128i biased = _mm_add_epi32(total, _mm_cvtsi32_si128(16));
+  const __m128i avg = _mm_srli_epi32(biased, 5);
+  (void)above;
+  (void)bd;
+  dc_store_16xh(dst, stride, 32, &avg);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// DC_TOP for a 16-wide, 8-row block: fills dst with the rounded average of
+// above[0..15], i.e. (sum + 8) >> 4. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(above);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)left;
+  (void)bd;
+  dc_store_16xh(dst, stride, 8, &avg);
+}
+
+// DC_TOP for a 16x16 block: fills dst with the rounded average of
+// above[0..15], i.e. (sum + 8) >> 4. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(above);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)left;
+  (void)bd;
+  dc_store_16xh(dst, stride, 16, &avg);
+}
+
+// DC_TOP for a 16-wide, 32-row block: fills dst with the rounded average of
+// above[0..15], i.e. (sum + 8) >> 4. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(above);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)left;
+  (void)bd;
+  dc_store_16xh(dst, stride, 32, &avg);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// DC_128 for a 16-wide, 8-row block: fills dst with 1 << (bd - 1). `above`
+// and `left` are not used.
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  // dc_store_16xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_16xh(dst, stride, 8, &mid);
+}
+
+// DC_128 for a 16x16 block: fills dst with 1 << (bd - 1). `above` and `left`
+// are not used.
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  // dc_store_16xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_16xh(dst, stride, 16, &mid);
+}
+
+// DC_128 for a 16-wide, 32-row block: fills dst with 1 << (bd - 1). `above`
+// and `left` are not used.
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  // dc_store_16xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_16xh(dst, stride, 32, &mid);
+}
+
+// -----------------------------------------------------------------------------
+// 32xh
+
+// Fills a 32-wide block of `height` rows with the value held in 16-bit
+// lane 0 of *dc, using four aligned 16-byte stores per row.
+static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                 const __m128i *dc) {
+  const __m128i lo4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(lo4, lo4);
+  int rows_left;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    int col;
+    for (col = 0; col < 32; col += 8) {
+      _mm_store_si128((__m128i *)(dst + col), fill);
+    }
+    dst += stride;
+  }
+}
+
+// DC_LEFT for a 32-wide, 16-row block: fills dst with the rounded average of
+// left[0..15], i.e. (sum + 8) >> 4. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_16(left);
+  const __m128i biased = _mm_add_epi16(total, _mm_cvtsi32_si128(8));
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)above;
+  (void)bd;
+  dc_store_32xh(dst, stride, 16, &avg);
+}
+
+// DC_LEFT for a 32x32 block: fills dst with the rounded average of
+// left[0..31], i.e. (sum + 16) >> 5, computed in 32-bit lanes. `above` and
+// `bd` are not used.
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_32(left);
+  const __m128i biased = _mm_add_epi32(total, _mm_cvtsi32_si128(16));
+  const __m128i avg = _mm_srli_epi32(biased, 5);
+  (void)above;
+  (void)bd;
+  dc_store_32xh(dst, stride, 32, &avg);
+}
+
+// DC_TOP for a 32-wide, 16-row block: fills dst with the rounded average of
+// above[0..31], i.e. (sum + 16) >> 5, computed in 32-bit lanes. `left` and
+// `bd` are not used.
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_32(above);
+  const __m128i biased = _mm_add_epi32(total, _mm_cvtsi32_si128(16));
+  const __m128i avg = _mm_srli_epi32(biased, 5);
+  (void)left;
+  (void)bd;
+  dc_store_32xh(dst, stride, 16, &avg);
+}
+
+// DC_128 for a 32-wide, 16-row block: fills dst with 1 << (bd - 1). `above`
+// and `left` are not used.
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  // dc_store_32xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_32xh(dst, stride, 16, &mid);
+}
+
+// DC_TOP for a 32x32 block: fills dst with the rounded average of
+// above[0..31], i.e. (sum + 16) >> 5, computed in 32-bit lanes. `left` and
+// `bd` are not used.
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i total = dc_sum_32(above);
+  const __m128i biased = _mm_add_epi32(total, _mm_cvtsi32_si128(16));
+  const __m128i avg = _mm_srli_epi32(biased, 5);
+  (void)left;
+  (void)bd;
+  dc_store_32xh(dst, stride, 32, &avg);
+}
+
+// DC_128 for a 32x32 block: fills dst with 1 << (bd - 1). `above` and `left`
+// are not used.
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  // dc_store_32xh() broadcasts 16-bit lane 0 itself, so the value only needs
+  // to be placed in lane 0 here.
+  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_32xh(dst, stride, 32, &mid);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+// Vertical prediction for a 4-wide, 8-row block: every row of dst is a copy
+// of above[0..3]. `left` and `bd` are not used.
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_storel_epi64((__m128i *)dst, top);
+  }
+}
+
+// Vertical prediction for an 8-wide, 4-row block: every row of dst is a copy
+// of above[0..7]. Uses aligned 16-byte loads and stores. `left` and `bd` are
+// not used.
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i top = _mm_load_si128((const __m128i *)above);
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(dst + row * stride), top);
+  }
+}
+
+// Vertical prediction for an 8-wide, 16-row block: every row of dst is a
+// copy of above[0..7]. Uses aligned 16-byte loads and stores. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const __m128i top = _mm_load_si128((const __m128i *)above);
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top);
+  }
+}
+
+// Vertical prediction for a 16-wide, 8-row block: every row of dst is a copy
+// of above[0..15]. Uses aligned 16-byte loads and stores. `left` and `bd`
+// are not used.
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
+  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top_lo);
+    _mm_store_si128((__m128i *)(dst + 8), top_hi);
+  }
+}
+
+// Vertical prediction for a 16-wide, 32-row block: every row of dst is a
+// copy of above[0..15]. Uses aligned 16-byte loads and stores. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
+  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 32; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top_lo);
+    _mm_store_si128((__m128i *)(dst + 8), top_hi);
+  }
+}
+
+// Vertical prediction for a 32-wide, 16-row block: every row of dst is a
+// copy of above[0..31]. Uses aligned 16-byte loads and stores. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const __m128i top0 = _mm_load_si128((const __m128i *)above);
+  const __m128i top1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i top2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i top3 = _mm_load_si128((const __m128i *)(above + 24));
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top0);
+    _mm_store_si128((__m128i *)(dst + 8), top1);
+    _mm_store_si128((__m128i *)(dst + 16), top2);
+    _mm_store_si128((__m128i *)(dst + 24), top3);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+// DC prediction for a 4-wide, 8-row block: fills dst with
+// (sum(above[0..3]) + sum(left[0..7]) + 6) / 12. `bd` is not used.
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i total = _mm_add_epi16(dc_sum_4(above), dc_sum_8(left));
+  // Read the total from 16-bit lane 1 (the upper half of the low dword).
+  const uint32_t sum = (uint32_t)_mm_cvtsi128_si32(total) >> 16;
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 6) / 12));
+  int row;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_storel_epi64((__m128i *)dst, fill);
+  }
+}
+
+// DC prediction for an 8-wide, 4-row block: fills dst with
+// (sum(above[0..7]) + sum(left[0..3]) + 6) / 12. `bd` is not used.
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i total = _mm_add_epi16(dc_sum_8(above), dc_sum_4(left));
+  // Read the total from 16-bit lane 1 (the upper half of the low dword).
+  const uint32_t sum = (uint32_t)_mm_cvtsi128_si32(total) >> 16;
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 6) / 12));
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(dst + row * stride), fill);
+  }
+}
+
+// DC prediction for an 8-wide, 16-row block: fills dst with
+// (sum(above[0..7]) + sum(left[0..15]) + 12) / 24. `bd` is not used.
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i zero = _mm_setzero_si128();
+  // Zero-extend both partial sums to 32 bits before combining them.
+  const __m128i left32 = _mm_unpacklo_epi16(dc_sum_16(left), zero);
+  const __m128i above32 = _mm_unpacklo_epi16(dc_sum_8(above), zero);
+  const uint32_t sum =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left32, above32));
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 12) / 24));
+  int row;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+  }
+}
+
+// DC prediction for a 16-wide, 8-row block: fills dst with
+// (sum(above[0..15]) + sum(left[0..7]) + 12) / 24. `bd` is not used.
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i zero = _mm_setzero_si128();
+  // Zero-extend both partial sums to 32 bits before combining them.
+  const __m128i left32 = _mm_unpacklo_epi16(dc_sum_8(left), zero);
+  const __m128i above32 = _mm_unpacklo_epi16(dc_sum_16(above), zero);
+  const uint32_t sum =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left32, above32));
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 12) / 24));
+  int row;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+  }
+}
+
+// DC prediction for a 16-wide, 32-row block: fills dst with
+// (sum(above[0..15]) + sum(left[0..31]) + 24) / 48. `bd` is not used.
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i zero = _mm_setzero_si128();
+  // dc_sum_32() already returns a 32-bit total; only the 16-sample sum needs
+  // zero-extension.
+  const __m128i left32 = dc_sum_32(left);
+  const __m128i above32 = _mm_unpacklo_epi16(dc_sum_16(above), zero);
+  const uint32_t sum =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left32, above32));
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 24) / 48));
+  int row;
+  for (row = 0; row < 32; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+  }
+}
+
+// DC prediction for a 32-wide, 16-row block: fills dst with
+// (sum(above[0..31]) + sum(left[0..15]) + 24) / 48. `bd` is not used.
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i zero = _mm_setzero_si128();
+  // dc_sum_32() already returns a 32-bit total; only the 16-sample sum needs
+  // zero-extension.
+  const __m128i left32 = _mm_unpacklo_epi16(dc_sum_16(left), zero);
+  const __m128i above32 = dc_sum_32(above);
+  const uint32_t sum =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left32, above32));
+  const __m128i fill = _mm_set1_epi16((uint16_t)((sum + 24) / 48));
+  int row;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+    _mm_store_si128((__m128i *)(dst + 16), fill);
+    _mm_store_si128((__m128i *)(dst + 24), fill);
+  }
+}
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result (per unsigned 16-bit lane)
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)          -- avg rounds up: (x+z+1)>>1
+; result -= xor(x,z) & 1     -- undo the round-up when x+z is odd: (x+z)>>1
+; result = avg(result,y)     -- (((x+z)>>1) + y + 1)>>1 == (x+2y+z+2)>>2
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {  // y is the centre tap (weight 2); x and z are the outer taps
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);  // rounded-up average of the outer taps
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));  // subtract 1 exactly where x and z differ in bit 0
+ return _mm_avg_epu16(b, *y);
+}
+
+void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 4x4 predictor; vector names spell the lanes, lowest lane first
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));  // lanes 4-7 = above[0..3] (A-D), lane 3 = above[-1] (X)
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);  // overwrite lanes 0-2 with the left samples K, J, I
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);  // shifted down one lane, zero filled
+ const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);  // shifted down two lanes
+ const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);  // lane i = rounded average of edge lanes i and i+1
+ const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);  // lane i = 3-tap filter over edge lanes i, i+1, i+2
+ const __m128i row0 = _mm_srli_si128(avg2, 6);  // starts at avg2 lane 3 = avg(X, A)
+ const __m128i row1 = _mm_srli_si128(avg3, 4);  // starts at avg3 lane 2 = taps (I, X, A)
+ const __m128i row2 = _mm_srli_si128(avg2, 4);  // row0 moved right one column; column 0 is patched below
+ const __m128i row3 = _mm_srli_si128(avg3, 2);  // row1 moved right one column; column 0 is patched below
+ (void)bd;  // bd is not used by this predictor
+ _mm_storel_epi64((__m128i *)dst, row0);  // low 64 bits = 4 pixels
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ // fix up column 0 of rows 2 and 3 with the 3-tap values from the left edge
+ dst -= stride;  // back to row 2
+ dst[0] = _mm_extract_epi16(avg3, 1);  // row 2, col 0: taps (J, I, X)
+ dst[stride] = _mm_extract_epi16(avg3, 0);  // row 3, col 0: taps (K, J, I)
+}
+
+void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 4x4 predictor; vector names spell the lanes, lowest lane first
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));  // lanes 4-7 = above[0..3] (A-D), lane 3 = above[-1] (X)
+ const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);  // overwrite lanes 0-2 with the left samples K, J, I
+ const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+ const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+ const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);  // shifted down one lane, zero filled
+ const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);  // shifted up one lane, L placed in lane 0
+ const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);  // lane i = 3-tap filter centred on KJIXABCD lane i; lane 0 = taps (J, K, L)
+ const __m128i row0 = _mm_srli_si128(avg3, 6);  // starts at lane 3 = taps (A, X, I)
+ const __m128i row1 = _mm_srli_si128(avg3, 4);  // each following row starts one lane lower
+ const __m128i row2 = _mm_srli_si128(avg3, 2);
+ const __m128i row3 = avg3;  // starts at lane 0
+ (void)bd;  // bd is not used by this predictor
+ _mm_storel_epi64((__m128i *)dst, row0);  // low 64 bits = 4 pixels
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 4x4 predictor; vector names spell the lanes, lowest lane first
+ const int I = left[0];
+ const int J = left[1];
+ const int K = left[2];
+ const int L = left[3];
+ const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));  // lanes 5-7 = above[0..2] (A-C), lane 4 = above[-1] (X)
+ const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);  // overwrite lanes 0-3 with the left samples L, K, J, I
+ const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
+ const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
+ const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
+ const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);  // shifted down one lane, zero filled
+ const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);  // shifted down two lanes
+ const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);  // lane i = 3-tap filter over edge lanes i, i+1, i+2
+ const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);  // lane i = rounded average of edge lanes i and i+1
+ const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);  // interleave: avg2[0], avg3[0], avg2[1], avg3[1], ...
+ const __m128i row2 = _mm_srli_si128(row3, 4);  // drop one (avg2, avg3) pair
+ const __m128i row1 = _mm_srli_si128(row3, 8);  // drop two pairs
+ const __m128i row0 = _mm_srli_si128(avg3, 4);  // avg3 lanes 2..5; column 0 is replaced below
+ (void)bd;  // bd is not used by this predictor
+ _mm_storel_epi64((__m128i *)dst, row0);  // low 64 bits = 4 pixels
+ dst[0] = _mm_extract_epi16(avg2, 3);  // row 0, col 0 = avg(I, X)
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 4x4 predictor built only from above[0..7]; names spell lanes lowest first
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);  // lanes 0-7 = above[0..7]
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);  // shifted down one lane, zero filled
+ __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);  // shifted down two lanes
+ CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);  // repeat above[7] in lane 6 so lane 6 of avg3 uses taps (G, H, H)
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);  // lane i = 3-tap filter centred on above[i+1]
+ (void)left;  // left and bd are not used by this predictor
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, avg3);  // row 0 = lanes 0..3
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // each row starts one lane further along
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));  // row 3 = lanes 3..6
+}
+
+void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 4-wide, 8-row predictor from above[0..11]; names spell lanes highest first
+ (void)left;  // left and bd are not used by this predictor
+ (void)bd;
+ __m128i h76543210 = _mm_load_si128((const __m128i *)above);  // aligned load: above must be 16-byte aligned
+ __m128i hx7654321 = _mm_srli_si128(h76543210, 2);  // shifted down one lane, top lane zero
+ __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);  // refill the top lane with above[8]
+ __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
+ __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);  // refill the top lane with above[9]
+ __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);  // lane i = 3-tap filter centred on above[i+1]
+ _mm_storel_epi64((__m128i *)dst, avg3);  // row 0 = lanes 0..3
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // each row starts one lane further along
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));  // row 4 = lanes 4..7
+ dst += stride;
+ // rows 5-7: restart the window at above[5]; variable names are reused, the comments below give the actual lanes
+ // hcba98765
+ h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));
+ h76543210 = _mm_insert_epi16(h76543210, above[11], 7);  // lane 7 would be above[12]; replaced by above[11]
+ // hxcba9876
+ hx7654321 = _mm_srli_si128(h76543210, 2);
+ // hxxcba987
+ hx8765432 = _mm_srli_si128(h76543210, 4);
+ avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);  // lane i = 3-tap filter centred on above[i+6]
+ _mm_storel_epi64((__m128i *)dst, avg3);  // row 5
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // row 6
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));  // row 7; its last pixel uses taps (above[10], above[11], above[11])
+}
+
+void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8-wide, 4-row predictor; row r filters above[r..], above[r+1..], above[r+2..]
+ (void)left;  // left and bd are not used by this predictor
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);  // aligned load: above must be 16-byte aligned
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+ __m128i y = avg3_epu16(&x0, &x1, &x2);  // row 0
+ _mm_store_si128((__m128i *)dst, y);  // aligned store: each dst row must be 16-byte aligned
+ dst += stride;
+ // x0/x1/x2 are recycled in rotation: only the oldest window is reloaded for each new row
+ x0 = _mm_loadu_si128((const __m128i *)(above + 3));
+ y = avg3_epu16(&x1, &x2, &x0);  // row 1: windows at above+1, +2, +3
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + 4));
+ y = avg3_epu16(&x2, &x0, &x1);  // row 2: windows at above+2, +3, +4
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + 5));
+ x2 = _mm_insert_epi16(x2, above[11], 7);  // lane 7 would be above[12]; replaced by above[11]
+ y = avg3_epu16(&x0, &x1, &x2);  // row 3: windows at above+3, +4, +5
+ _mm_store_si128((__m128i *)dst, y);
+}
+
+static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
+ const __m128i *a2, uint16_t **dst,
+ ptrdiff_t stride) {  // writes one 8-pixel row and advances *dst to the next row
+ const __m128i y = avg3_epu16(a0, a1, a2);  // 3-tap filter; a1 is the centre tap
+ _mm_storeu_si128((__m128i *)*dst, y);  // unaligned store of 8 pixels
+ *dst += stride;  // the caller's pointer is updated through the double pointer
+}
+
+void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8x8 predictor; row r filters the windows at above+r, above+r+1, above+r+2
+ (void)left;  // left and bd are not used by this predictor
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);  // aligned load: above must be 16-byte aligned
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+ // row 0
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ // rows 1-6: two loop iterations of three rows each (i runs 3 -> 6 -> 9)
+ int i = 3;
+ do {
+ x0 = _mm_loadu_si128((const __m128i *)(above + i++));  // reload only the oldest window, then rotate the argument order
+ d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ } while (i < 9);
+ // row 7: windows at above+7, +8, +9
+ x0 = _mm_loadu_si128((const __m128i *)(above + 9));
+ x0 = _mm_insert_epi16(x0, above[15], 7);  // lane 7 would be above[16]; replaced by above[15]
+ const __m128i y = avg3_epu16(&x1, &x2, &x0);
+ _mm_store_si128((__m128i *)dst, y);  // aligned store, unlike the unaligned store inside d45e_w8
+}
+
+void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8-wide, 16-row predictor; row r filters the windows at above+r, above+r+1, above+r+2
+ (void)left;  // left and bd are not used by this predictor
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);  // aligned load: above must be 16-byte aligned
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+ // row 0
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ // rows 1-12: four loop iterations of three rows each (i runs 3 -> 15)
+ int i = 3;
+ do {
+ x0 = _mm_loadu_si128((const __m128i *)(above + i++));  // reload only the oldest window, then rotate the argument order
+ d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ } while (i < 15);
+ // rows 13-15 are written inline with aligned stores
+ x0 = _mm_loadu_si128((const __m128i *)(above + 15));
+ __m128i y = avg3_epu16(&x1, &x2, &x0);  // row 13: windows at above+13, +14, +15
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + 16));
+ y = avg3_epu16(&x2, &x0, &x1);  // row 14: windows at above+14, +15, +16
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + 17));
+ x2 = _mm_insert_epi16(x2, above[23], 7);  // lane 7 would be above[24]; replaced by above[23]
+ y = avg3_epu16(&x0, &x1, &x2);  // row 15: windows at above+15, +16, +17
+ _mm_store_si128((__m128i *)dst, y);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
new file mode 100644
index 000000000..b089a3f43
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result (per unsigned 16-bit lane)
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)          -- avg rounds up: (x+z+1)>>1
+; result -= xor(x,z) & 1     -- undo the round-up when x+z is odd: (x+z)>>1
+; result = avg(result,y)     -- (((x+z)>>1) + y + 1)>>1 == (x+2y+z+2)>>2
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+ const __m128i *z) {  // y is the centre tap (weight 2); x and z are the outer taps
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a = _mm_avg_epu16(*x, *z);  // rounded-up average of the outer taps
+ const __m128i b =
+ _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));  // subtract 1 exactly where x and z differ in bit 0
+ return _mm_avg_epu16(b, *y);
+}
+
+DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {  // byte-shuffle mask used with _mm_shuffle_epi8
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1  // output lane i = input lane i+1; input lane 0 wraps to lane 7
+};
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {  // rotates *a in place by the mask *rotrw and returns the new value
+ *a = _mm_shuffle_epi8(*a, *rotrw);  // with rotate_right_epu16: old lane 0 moves to lane 7, every other lane moves down one
+ return *a;  // callers rely on the side effect: each call exposes the next lane in lane 7
+}
+
+void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8x8 predictor; vector names spell the lanes, lowest lane first
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i IXABCDEF =
+ _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);  // left[0] followed by above[-1..5]
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);  // 3-tap filter centred on above[c-1]
+ const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);  // rounded average of above[c-1] and above[c]
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);  // left[1..7], top lane zero
+ __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);  // lane k = 3-tap filter centred on left[k]
+ __m128i rowa = avg2;  // even rows start from the 2-tap values
+ __m128i rowb = avg3;  // odd rows start from the 3-tap values
+ int i;
+ (void)bd;  // bd is not used by this predictor
+ for (i = 0; i < 8; i += 2) {  // two rows per iteration
+ _mm_store_si128((__m128i *)dst, rowa);  // aligned store: each dst row must be 16-byte aligned
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb);
+ dst += stride;
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);  // move right one column; column 0 takes the next avg3_left lane
+ rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);  // second rotation: rowb takes the lane after rowa's
+ }
+}
+
+void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16x16 predictor; suffix _0/_1 = columns 0-7 / 8-15
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));  // above[7..14]
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));  // above[8..15], aligned load
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);  // rounded average of above[c] and above[c-1]
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));  // left[8..15]
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);  // left[0] followed by above[-1..5]
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);  // above[6..13]
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // 3-tap filter centred on above[c-1]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);  // left[1..8]
+ const __m128i L1_ = _mm_srli_si128(L1, 2);  // left[9..15], top lane zero
+ __m128i rowa_0 = avg2_0;  // even rows start from the 2-tap values
+ __m128i rowa_1 = avg2_1;
+ __m128i rowb_0 = avg3_0;  // odd rows start from the 3-tap values
+ __m128i rowb_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);  // lane k = 3-tap filter centred on left[k]
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);  // same for left[8..15]
+ for (i = 0; i < 2; ++i) {  // one pass per group of 8 left-edge values
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {  // two rows per iteration
+ _mm_store_si128((__m128i *)dst, rowa_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ dst += stride;
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);  // move right one column, high half first so rowa_0 is still unshifted
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // column 0 takes the next left-edge lane
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);  // second rotation: rowb takes the lane after rowa's
+ }
+ }
+}
+
+void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 32x32 predictor; suffix _0.._3 = successive groups of 8 columns
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_load_si128((const __m128i *)above);  // A* = above[c], aligned loads
+ const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));  // B* = above[c-1]
+ const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i avg2_0 = _mm_avg_epu16(A0, B0);  // rounded average of above[c] and above[c-1]
+ const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+ const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+ const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..31] in four aligned loads
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);  // left[0] followed by above[-1..5]
+ const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);  // C1..C3 = above[c-2]
+ const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+ const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // 3-tap filter centred on above[c-1]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 = left[k-1]
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);  // L*_ = left[k+1]
+ const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+ const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+ const __m128i L3_ = _mm_srli_si128(L3, 2);  // left[25..31], top lane zero
+ __m128i rowa_0 = avg2_0;  // even rows start from the 2-tap values
+ __m128i rowa_1 = avg2_1;
+ __m128i rowa_2 = avg2_2;
+ __m128i rowa_3 = avg2_3;
+ __m128i rowb_0 = avg3_0;  // odd rows start from the 3-tap values
+ __m128i rowb_1 = avg3_1;
+ __m128i rowb_2 = avg3_2;
+ __m128i rowb_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+ avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);  // lane k = 3-tap filter centred on left[k]
+ avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+ avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+ avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+ for (i = 0; i < 4; ++i) {  // one pass per group of 8 left-edge values
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; j += 2) {  // two rows per iteration
+ _mm_store_si128((__m128i *)dst, rowa_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, rowb_0);
+ _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+ dst += stride;
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);  // move right one column, highest group first so lower groups are still unshifted
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // column 0 takes the next left-edge lane
+ rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+ rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+ rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+ rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);  // second rotation: rowb takes the lane after rowa's
+ }
+ }
+}
+
+void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8x8 predictor; vector names spell the lanes, lowest lane first
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);  // above[1..7], top lane zero
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);  // lane c = 3-tap filter centred on above[c]
+ __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);  // lane 0 = taps (left[0], above[-1], above[0]); lane k is centred one sample further down the left edge
+ __m128i rowa = avg3;
+ int i;
+ (void)bd;  // bd is not used by this predictor
+ for (i = 0; i < 8; ++i) {
+ rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);  // shift before storing: lane 7 of the initial avg3 (which used the zero fill) is never written
+ _mm_store_si128((__m128i *)dst, rowa);  // aligned store: each dst row must be 16-byte aligned
+ dst += stride;
+ }
+}
+
+void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16x16 predictor; suffix 0/1 = columns 0-7 / 8-15
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));  // above[7..14]
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));  // above[8..15], aligned load
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));  // left[8..15]
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);  // above[1..8]
+ const __m128i C1 = _mm_srli_si128(B1, 2);  // above[9..15], top lane zero
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // lane c = 3-tap filter centred on above[c]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);  // left[6..13]
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i avg3_left[2];
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);  // lane 0 = taps (left[0], above[-1], above[0]); later lanes step down the left edge
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ for (i = 0; i < 2; ++i) {  // one pass per group of 8 left-edge values
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {  // one row per iteration
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);  // shift before storing, high half first so rowa_0 is still unshifted
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // column 0 takes the next left-edge lane
+ _mm_store_si128((__m128i *)dst, rowa_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ dst += stride;
+ }
+ }
+}
+
+void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 32x32 predictor; suffix 0..3 = successive groups of 8 columns
+ const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // A* = above[c-1]
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_load_si128((const __m128i *)above);  // B* = above[c], aligned loads
+ const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..31] in four aligned loads
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);  // C* = above[c+1]
+ const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+ const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+ const __m128i C3 = _mm_srli_si128(B3, 2);  // above[25..31], top lane zero
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // lane c = 3-tap filter centred on above[c]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 = left[k-1]
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);  // L1_..L3_ = left[k-2]
+ const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+ const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+ __m128i rowa_0 = avg3_0;
+ __m128i rowa_1 = avg3_1;
+ __m128i rowa_2 = avg3_2;
+ __m128i rowa_3 = avg3_3;
+ __m128i avg3_left[4];
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+ avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);  // lane 0 = taps (left[0], above[-1], above[0]); later lanes step down the left edge
+ avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+ avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+ avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+ for (i = 0; i < 4; ++i) {  // one pass per group of 8 left-edge values
+ __m128i avg_left = avg3_left[i];
+ for (j = 0; j < 8; ++j) {  // one row per iteration
+ rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);  // shift before storing, highest group first so lower groups are still unshifted
+ rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+ rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+ rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);  // column 0 takes the next left-edge lane
+ _mm_store_si128((__m128i *)dst, rowa_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+ _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+ _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+ dst += stride;
+ }
+ }
+}
+
+void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 8x8 predictor; vector names spell the lanes, lowest lane first
+ const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);  // above[0..6], top lane zero
+ const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);  // above[1..6], top two lanes zero
+ const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);  // lane i = 3-tap filter centred on above[i]
+ const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i XIJKLMNO =
+ _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i AXIJKLMN =
+ _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);  // 3-tap values along the left edge
+ const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);  // 2-tap values along the left edge
+ const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);  // interleaved (avg2, avg3) pairs for lanes 0-3
+ const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);  // interleaved pairs for lanes 4-7
+ const __m128i row0 =
+ _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);  // pair 0 in columns 0-1, then avg3 lanes 0..5
+ const __m128i row1 =
+ _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);  // previous row moved right two columns, pair 1 in front
+ const __m128i row2 =
+ _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);  // pair 2 in front
+ const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);  // pair 3 in front (already in the top two lanes)
+ const __m128i row4 =
+ _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);  // same sequence with pairs 4-7
+ const __m128i row5 =
+ _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+ const __m128i row6 =
+ _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+ const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+ (void)bd;  // bd is not used by this predictor
+ _mm_store_si128((__m128i *)dst, row0);  // aligned stores: each dst row must be 16-byte aligned
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row1);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row2);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row3);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row4);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row5);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row6);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row7);
+}
+
+void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16x16 predictor; suffix 0/1 = columns (or left rows) 0-7 / 8-15
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // above[-1..6]
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));  // above[7..14]
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);  // above[0..7]
+ const __m128i B1 = _mm_srli_si128(A1, 2);  // above[8..14], top lane zero
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);  // above[1..8]
+ const __m128i C1 = _mm_srli_si128(A1, 4);  // above[9..14], top two lanes zero
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // lane i = 3-tap filter centred on above[i]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..7], aligned load
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));  // left[8..15]
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // left[7..14]
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);  // left[6..13]
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);  // 3-tap values along the left edge
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);  // 2-tap values along the left edge
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i avg2_avg3_left[2][2];  // [group of 8 left rows][low/high half] of interleaved (avg2, avg3) pairs
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 2; ++i) {  // each inner iteration writes 4 rows, one per (avg2, avg3) pair
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);  // move right two columns, high half first so row_0 is still unshifted
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);  // pair 0 enters columns 0-1
+ _mm_store_si128((__m128i *)dst, row_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);  // pair 1
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);  // pair 2
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);  // pair 3 (already in the top two lanes)
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ dst += stride;
+ }
+ }
+}
+
+void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 32x32 predictor; suffix 0..3 = successive groups of 8 columns (or left rows)
+ const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));  // A* = above[c-1]
+ const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+ const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+ const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+ const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);  // B* = above[c]
+ const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+ const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+ const __m128i B3 = _mm_srli_si128(A3, 2);  // above[24..30], top lane zero
+ const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);  // C* = above[c+1]
+ const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+ const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+ const __m128i C3 = _mm_srli_si128(A3, 4);  // above[25..30], top two lanes zero
+ const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);  // lane i = 3-tap filter centred on above[i]
+ const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+ const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+ const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+ const __m128i L0 = _mm_load_si128((const __m128i *)left);  // left[0..31] in four aligned loads
+ const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+ const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+ const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+ const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);  // above[-1] followed by left[0..6]
+ const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);  // XL1..XL3 = left[k-1]
+ const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+ const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+ const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);  // above[0], above[-1], then left[0..5]
+ const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);  // AXL1..AXL3 = left[k-2]
+ const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+ const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+ const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);  // 3-tap values along the left edge
+ const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+ const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+ const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+ const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);  // 2-tap values along the left edge
+ const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+ const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+ const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+ __m128i row_0 = avg3_0;
+ __m128i row_1 = avg3_1;
+ __m128i row_2 = avg3_2;
+ __m128i row_3 = avg3_3;
+ __m128i avg2_avg3_left[4][2];  // [group of 8 left rows][low/high half] of interleaved (avg2, avg3) pairs
+ int i, j;
+ (void)bd;  // bd is not used by this predictor
+
+ avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+ avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+ avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+ avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+ avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+ for (j = 0; j < 4; ++j) {
+ for (i = 0; i < 2; ++i) {  // each inner iteration writes 4 rows, one per (avg2, avg3) pair
+ const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);  // move right two columns, highest group first so lower groups are still unshifted
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);  // pair 0 enters columns 0-1
+ _mm_store_si128((__m128i *)dst, row_0);  // aligned stores: each dst row must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);  // pair 1
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);  // pair 2
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+ row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+ row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+ row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);  // pair 3 (already in the top two lanes)
+ _mm_store_si128((__m128i *)dst, row_0);
+ _mm_store_si128((__m128i *)(dst + 8), row_1);
+ _mm_store_si128((__m128i *)(dst + 16), row_2);
+ _mm_store_si128((__m128i *)(dst + 24), row_3);
+ dst += stride;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
new file mode 100644
index 000000000..94c68885c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
+#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,  // bl, l, t: each is read with a 16-byte aligned 128-bit load
+ const uint8_t *t, int bd, __m256i *blt,  // bd: bit depth; every output lane is shifted left by (bd - 8)
+ __m256i *lt, __m256i *thr) {  // outputs: *blt from bl, *lt from l, *thr from t
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+ // For each input: zero-extend its low 8 bytes to 16-bit lanes, copy that vector into both 128-bit halves, then apply the shift.
+ __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+ __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ *blt = _mm256_slli_epi16(y, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+ y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ *lt = _mm256_slli_epi16(y, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+ y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ *thr = _mm256_slli_epi16(y, shift);
+}
+// Loads `size` rows on each side of s with unaligned 256-bit loads: p[i] is the row at s - (i + 1) * pitch, q[i] the row at s + i * pitch.
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+ __m256i *p, __m256i *q) {
+ int i;
+ for (i = 0; i < size; i++) {
+ p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch));
+ q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch));
+ }
+}
+// Sets each 16-bit lane of *hev to 0xFFFF where max(|p[1] - p[0]|, |q[1] - q[0]|) exceeds the matching lane of *t, and to 0 otherwise.
+static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q,
+ const __m256i *t, __m256i *hev) {
+ const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0]));
+ const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0]));
+ __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0);
+ h = _mm256_subs_epu16(h, *t);  // unsigned saturating subtract: 0 unless h > t
+
+ const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+ const __m256i zero = _mm256_setzero_si256();
+ *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff);  // invert the "== 0" mask
+}
+// Sets each lane of *mask to 0xFFFF when 2 * |p0 - q0| + |p1 - q1| / 2 <= *bl and |p[i] - p[i - 1]|, |q[i] - q[i - 1]| <= *l for i = 1..3; otherwise 0.
+static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q,
+ const __m256i *l, const __m256i *bl,
+ __m256i *mask) {
+ __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0]));
+ __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1]));
+ abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);  // 2 * |p0 - q0| (saturating)
+ abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);  // |p1 - q1| / 2
+
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+ __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff);  // 0xFFFF where the blimit test failed
+ max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one));  // failing lanes become limit + 1 so they survive the max/subtract below
+
+ int i;
+ for (i = 1; i < 4; ++i) {
+ max = _mm256_max_epi16(max,
+ _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1])));
+ max = _mm256_max_epi16(max,
+ _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1])));
+ }
+ max = _mm256_subs_epu16(max, *l);  // 0 where nothing exceeded the limit
+ *mask = _mm256_cmpeq_epi16(max, zero); // 0xFFFF where every test passed (original note: return ~mask)
+}
+// Sets each lane of *flat to 0xFFFF when |p[i] - p[0]| and |q[i] - q[0]| are <= the scaled *th for every i in [start, end), and to 0 otherwise.
+static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p,
+ const __m256i *q, int bd, int start,
+ int end, __m256i *flat) {
+ __m256i max = _mm256_setzero_si256();
+ int i;
+ for (i = start; i < end; ++i) {
+ max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0])));
+ max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0])));
+ }
+ // The threshold is used as-is for bd == 8, shifted left by 2 for bd == 10 and by 4 for any other bd.
+ __m256i ft;
+ if (bd == 8)
+ ft = _mm256_subs_epu16(max, *th);
+ else if (bd == 10)
+ ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2));
+ else // bd == 12
+ ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4));
+
+ const __m256i zero = _mm256_setzero_si256();
+ *flat = _mm256_cmpeq_epi16(ft, zero);  // saturating subtract left 0 exactly where max <= threshold
+}
+
+// Note:
+// Reads p[1..3] and q[1..3] and compares each against p[0] / q[0]; *flat gets 0xFFFF in lanes where all of them are within the bd-scaled *th.
+static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p,
+ const __m256i *q, __m256i *flat, int bd) {
+ // check the distance 1,2,3 against 0
+ flat_mask_internal(th, p, q, bd, 1, 4, flat);
+}
+
+// Note:
+// Reads p[4..7] and q[4..7] and compares each against p[0] / q[0]; *flat gets 0xFFFF in lanes where all of them are within the bd-scaled *th.
+static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p,
+ const __m256i *q, __m256i *flat, int bd) {
+ flat_mask_internal(th, p, q, bd, 4, 8, flat);
+}
+// Clamps every signed 16-bit lane of *pixel into [*min, *max] in place, using compare masks to select between the value and the bound.
+static INLINE void pixel_clamp(const __m256i *min, const __m256i *max,
+ __m256i *pixel) {
+ __m256i clamped, mask;
+ // Upper bound: lanes greater than *max are replaced by *max.
+ mask = _mm256_cmpgt_epi16(*pixel, *max);
+ clamped = _mm256_andnot_si256(mask, *pixel);
+ mask = _mm256_and_si256(mask, *max);
+ clamped = _mm256_or_si256(mask, clamped);
+ // Lower bound: lanes not greater than *min are replaced by *min.
+ mask = _mm256_cmpgt_epi16(clamped, *min);
+ clamped = _mm256_and_si256(mask, clamped);
+ mask = _mm256_andnot_si256(mask, *min);
+ *pixel = _mm256_or_si256(clamped, mask);
+}
+// Narrow (4-tap) filter: computes new values for p[0], p[1], q[0], q[1] and writes them to ps[0], ps[1], qs[0], qs[1]; p and q are only read.
+static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
+ const __m256i *th, int bd, __m256i *ps,
+ __m256i *qs) {
+ __m256i t80;  // mid-range offset: 0x80 << (bd - 8) for bd 8 / 10, with any other bd handled as 12
+ if (bd == 8)
+ t80 = _mm256_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm256_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm256_set1_epi16(0x800);
+ // Subtract the offset so the pixels are centred on zero for the signed saturating arithmetic below.
+ __m256i ps0 = _mm256_subs_epi16(p[0], t80);
+ __m256i ps1 = _mm256_subs_epi16(p[1], t80);
+ __m256i qs0 = _mm256_subs_epi16(q[0], t80);
+ __m256i qs1 = _mm256_subs_epi16(q[1], t80);
+
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i pmax = _mm256_subs_epi16(
+ _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);  // (1 << bd) - 1 - t80
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i pmin = _mm256_subs_epi16(zero, t80);  // -t80
+
+ __m256i filter = _mm256_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
+
+ __m256i hev;
+ highbd_hev_mask(p, q, th, &hev);
+ filter = _mm256_and_si256(filter, hev);  // keep ps1 - qs1 only in lanes where hev is set
+
+ const __m256i x = _mm256_subs_epi16(qs0, ps0);
+ filter = _mm256_adds_epi16(filter, x);  // three saturating adds: filter += 3 * (qs0 - ps0)
+ filter = _mm256_adds_epi16(filter, x);
+ filter = _mm256_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm256_and_si256(filter, *mask);  // zero the adjustment where the filter mask is 0
+
+ const __m256i t3 = _mm256_set1_epi16(3);
+ const __m256i t4 = _mm256_set1_epi16(4);
+
+ __m256i filter1 = _mm256_adds_epi16(filter, t4);
+ __m256i filter2 = _mm256_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm256_srai_epi16(filter1, 3);
+ filter2 = _mm256_srai_epi16(filter2, 3);
+ // Inner taps: q0 moves by -filter1 and p0 by +filter2, each clamped to the pixel range.
+ qs0 = _mm256_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm256_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
+
+ qs[0] = _mm256_adds_epi16(qs0, t80);  // add the offset back
+ ps[0] = _mm256_adds_epi16(ps0, t80);
+ // Outer taps: (filter1 + 1) >> 1, applied only in lanes where hev is not set.
+ filter = _mm256_adds_epi16(filter1, one);
+ filter = _mm256_srai_epi16(filter, 1);
+ filter = _mm256_andnot_si256(hev, filter);
+
+ qs1 = _mm256_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm256_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
+
+ qs[1] = _mm256_adds_epi16(qs1, t80);
+ ps[1] = _mm256_adds_epi16(ps1, t80);
+}
+#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
+// With CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 this AVX2 entry point only forwards its arguments to the SSE2 version.
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
+ const uint8_t *blt,
+ const uint8_t *lt,
+ const uint8_t *thr, int bd) {
+ aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
+}
+// Forwards all arguments unchanged to aom_highbd_lpf_vertical_16_dual_sse2().
+void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
+ const uint8_t *blt, const uint8_t *lt,
+ const uint8_t *thr, int bd) {
+ aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
+}
+// Forwards all arguments unchanged to aom_highbd_lpf_horizontal_4_dual_sse2().
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+// Forwards all arguments unchanged to aom_highbd_lpf_horizontal_8_dual_sse2().
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+// Forwards all arguments unchanged to aom_highbd_lpf_vertical_4_dual_sse2().
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+// Forwards all arguments unchanged to aom_highbd_lpf_vertical_8_dual_sse2().
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+#else  // !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4): the functions below are implemented with 256-bit intrinsics
+void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,  // wide edge filter over 16 columns: reads 8 rows on each side of s, rewrites rows s - 7 * pitch .. s + 6 * pitch
+ const uint8_t *blt,  // blt, lt, thr: passed to get_limit(), which reads them with 16-byte aligned loads
+ const uint8_t *lt,
+ const uint8_t *thr, int bd) {  // bd: bit depth; the helpers branch on 8, 10, and treat anything else as 12
+ __m256i blimit, limit, thresh;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
+ // p[i] is the row at s - (i + 1) * pitch and q[i] the row at s + i * pitch, for i = 0..7.
+ __m256i p[8], q[8];
+ load_highbd_pixel(s, 8, pitch, p, q);
+
+ __m256i mask;
+ highbd_filter_mask(p, q, &limit, &blimit, &mask);
+ // flat tests p[1..3]/q[1..3] against p[0]/q[0]; flat2 does the same for p[4..7]/q[4..7]; both use a threshold of 1 scaled by bit depth.
+ __m256i flat, flat2;
+ const __m256i one = _mm256_set1_epi16(1);
+ highbd_flat_mask4(&one, p, q, &flat, bd);
+ highbd_flat_mask5(&one, p, q, &flat2, bd);
+ // flat only counts where the filter mask passes, and flat2 only where flat does.
+ flat = _mm256_and_si256(flat, mask);
+ flat2 = _mm256_and_si256(flat2, flat);
+ // Narrow-filter results for p[0..1] / q[0..1]; selected further down in lanes where flat is 0.
+ __m256i ps[2], qs[2];
+ highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
+
+ // flat and wide flat calculations
+ __m256i flat_p[3], flat_q[3];
+ __m256i flat2_p[7], flat2_q[7];
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ // Build sum_p = p[0..6] + q[0..6] + 8 (rounding for >> 4) and sum_lp = p[0..2] + q[0..2] + 4 (rounding for >> 3).
+ __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]),
+ _mm256_add_epi16(p[4], p[3]));
+ __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]),
+ _mm256_add_epi16(q[4], q[3]));
+
+ __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1]));
+ sum_p = _mm256_add_epi16(sum_p, sum_lp);
+
+ __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1]));
+ sum_q = _mm256_add_epi16(sum_q, sum_lq);
+ sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q));
+ sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq));
+ // Output 0 of each filter: the shared sum plus the outermost sample and the centre sample of that side.
+ flat2_p[0] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4);
+ flat2_q[0] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4);
+ flat_p[0] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3);
+ flat_q[0] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3);
+
+ __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]);
+ __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]);
+ __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]);
+ __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]);
+ // From here sum_p / sum_q diverge: the p-side sum drops the farthest q sample and the q-side sum drops the farthest p sample.
+ sum_q = _mm256_sub_epi16(sum_p, p[6]);
+ sum_p = _mm256_sub_epi16(sum_p, q[6]);
+ flat2_p[1] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4);
+ flat2_q[1] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4);
+
+ sum_lq = _mm256_sub_epi16(sum_lp, p[2]);
+ sum_lp = _mm256_sub_epi16(sum_lp, q[2]);
+ flat_p[1] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3);
+ flat_q[1] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3);
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p[7]);  // outermost samples now carry weight 3
+ sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
+ sum_p3 = _mm256_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm256_add_epi16(sum_q3, q[3]);
+
+ sum_p = _mm256_sub_epi16(sum_p, q[5]);
+ sum_q = _mm256_sub_epi16(sum_q, p[5]);
+ flat2_p[2] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4);
+ flat2_q[2] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4);
+
+ sum_lp = _mm256_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm256_sub_epi16(sum_lq, p[1]);
+ flat_p[2] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3);
+ flat_q[2] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3);
+ // Wide outputs 3..6 repeat the same step: one more outermost sample in, one more far-side sample out.
+ int i;
+ for (i = 3; i < 7; ++i) {
+ sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
+ sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
+ sum_p = _mm256_sub_epi16(sum_p, q[7 - i]);
+ sum_q = _mm256_sub_epi16(sum_q, p[7 - i]);
+ flat2_p[i] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4);
+ flat2_q[i] = _mm256_srli_epi16(
+ _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4);
+ }
+ }
+
+ // highbd_filter8
+ p[2] = _mm256_andnot_si256(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm256_and_si256(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm256_andnot_si256(flat, q[2]);
+ flat_q[2] = _mm256_and_si256(flat, flat_q[2]);
+ q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values
+ // p[1], p[0], q[0], q[1]: take ps/qs from highbd_filter4 where flat is 0 and flat_p/flat_q where flat is set.
+ int i;
+ for (i = 1; i >= 0; i--) {
+ ps[i] = _mm256_andnot_si256(flat, ps[i]);
+ flat_p[i] = _mm256_and_si256(flat, flat_p[i]);
+ p[i] = _mm256_or_si256(ps[i], flat_p[i]);
+ qs[i] = _mm256_andnot_si256(flat, qs[i]);
+ flat_q[i] = _mm256_and_si256(flat, flat_q[i]);
+ q[i] = _mm256_or_si256(qs[i], flat_q[i]);
+ }
+
+ // highbd_filter16
+
+ for (i = 6; i >= 0; i--) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm256_andnot_si256(flat2, p[i]);
+ flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values
+
+ q[i] = _mm256_andnot_si256(flat2, q[i]);
+ flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]);
+ q[i] = _mm256_or_si256(q[i], flat2_q[i]);
+ _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]);  // rows 0..6 on each side are written back; p[7] / q[7] are never stored
+ _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]);
+ }
+}
+// Loads 16 rows of 16 uint16_t from src (stride src_p), passes them through mm256_transpose_16x16() (defined elsewhere; by name a 16x16 transpose), and stores 16 rows to dst (stride dst_p).
+static INLINE void highbd_transpose16x16(uint16_t *src, int src_p,
+ uint16_t *dst, int dst_p) {
+ __m256i x[16];
+ int i;
+ for (i = 0; i < 16; ++i) {
+ x[i] = _mm256_loadu_si256((const __m256i *)src);  // unaligned load, so src needs no particular alignment
+ src += src_p;
+ }
+ mm256_transpose_16x16(x, x);  // same array passed as input and output
+ for (i = 0; i < 16; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, x[i]);
+ dst += dst_p;
+ }
+}
+// Vertical-edge variant: moves the 16x16 block starting at s - 8 into t_dst via highbd_transpose16x16(), runs the horizontal edge-16 filter at row 8 of t_dst, then moves the block back the same way.
+void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);  // scratch for 16 rows of 16 samples
+
+ // Transpose 16x16
+ highbd_transpose16x16(s - 8, p, t_dst, 16);
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit,
+ thresh, bd);
+
+ // Transpose back
+ highbd_transpose16x16(t_dst, 16, s - 8, p);
+}
+// Builds limit vectors for two edges: the widened low 8 bytes of b0/l0/t0 fill the low 128 bits of *blt/*lt/*thr, those of b1/l1/t1 the high 128 bits; all lanes are then shifted left by (bd - 8).
+static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0,
+ const uint8_t *t0, const uint8_t *b1,
+ const uint8_t *l1, const uint8_t *t1, int bd,
+ __m256i *blt, __m256i *lt, __m256i *thr) {
+ const __m128i z128 = _mm_setzero_si128();
+ const __m128i blimit0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128);  // aligned 16-byte load; only the low 8 bytes are kept (same for the five loads below)
+ const __m128i limit0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128);
+ const __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128);
+ const __m128i blimit1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128);
+ const __m128i limit1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128);
+ const __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128);
+ // Set 0 goes to the low half (lanes 0-7), set 1 to the high half (lanes 8-15).
+ *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1);
+ *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1);
+ *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1);
+
+ int shift = bd - 8;
+ *blt = _mm256_slli_epi16(*blt, shift);
+ *lt = _mm256_slli_epi16(*lt, shift);
+ *thr = _mm256_slli_epi16(*thr, shift);
+}
+// 4-tap horizontal-edge filter over 16 columns; lanes 0-7 use the *0 parameter set and lanes 8-15 the *1 set. Reads rows s - 4 * p .. s + 3 * p and rewrites rows s - 2 * p .. s + 1 * p.
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
+ __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
+ __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
+ __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
+ __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p));
+ __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
+ __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
+ __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
+
+ const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
+ const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
+
+ __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
+ __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
+
+ __m256i blimit, limit, thresh;
+ get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit, &limit, &thresh);
+ // Per-bit-depth constants: t80 is the mid-range offset; tffe0/t1f and tff80/t7f are the sign-fill and keep masks for the emulated arithmetic >> 3 and >> 1 below.
+ __m256i t80, tff80, tffe0, t1f, t7f;
+ if (bd == 8) {
+ t80 = _mm256_set1_epi16(0x80);
+ tff80 = _mm256_set1_epi16(0xff80);
+ tffe0 = _mm256_set1_epi16(0xffe0);
+ t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8);
+ t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2);
+ tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2);
+ tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2);
+ t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6);
+ t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4);
+ tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4);
+ tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4);
+ t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4);
+ t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4);
+ }
+ // Zero-centred copies of rows p1, p0, q0, q1; the rows are loaded from memory again here rather than reusing the registers above.
+ __m256i ps1 =
+ _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80);
+ __m256i ps0 =
+ _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80);
+ __m256i qs0 =
+ _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80);
+ __m256i qs1 =
+ _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80);
+
+ // filter_mask and hev_mask
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);  // here "flat" is just max(|p1 - p0|, |q1 - q0|)
+ __m256i hev = _mm256_subs_epu16(flat, thresh);
+ const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+ hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);  // 0xFFFF where that max exceeds thresh
+
+ abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
+ __m256i mask =
+ _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ const __m256i one = _mm256_set1_epi16(1);
+ mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
+ mask = _mm256_max_epi16(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ __m256i work = _mm256_max_epi16(
+ _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)),  // OR of the two saturating differences gives |p2 - p1|
+ _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3)));
+ mask = _mm256_max_epi16(work, mask);
+ work = _mm256_max_epi16(
+ _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)),
+ _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3)));
+ mask = _mm256_max_epi16(work, mask);
+ mask = _mm256_subs_epu16(mask, limit);
+ mask = _mm256_cmpeq_epi16(mask, zero);  // 0xFFFF where no difference exceeded its limit
+
+ // filter4
+ const __m256i pmax = _mm256_subs_epi16(
+ _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
+ const __m256i pmin = _mm256_subs_epi16(zero, t80);
+
+ __m256i filt = _mm256_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm256_and_si256(filt, hev);
+ __m256i work_a = _mm256_subs_epi16(qs0, ps0);
+ filt = _mm256_adds_epi16(filt, work_a);
+ filt = _mm256_adds_epi16(filt, work_a);
+ filt = _mm256_adds_epi16(filt, work_a);
+ pixel_clamp(&pmin, &pmax, &filt);
+
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm256_and_si256(filt, mask);
+
+ const __m256i t4 = _mm256_set1_epi16(4);
+ const __m256i t3 = _mm256_set1_epi16(3);
+
+ __m256i filter1 = _mm256_adds_epi16(filt, t4);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ __m256i filter2 = _mm256_adds_epi16(filt, t3);
+ pixel_clamp(&pmin, &pmax, &filter2);
+
+ // Filter1 >> 3
+ work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0
+ filter1 = _mm256_srli_epi16(filter1, 3);
+ work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm256_and_si256(filter1, t1f); // clamp the range
+ filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits
+
+ // Filter2 >> 3
+ work_a = _mm256_cmpgt_epi16(zero, filter2);
+ filter2 = _mm256_srli_epi16(filter2, 3);
+ work_a = _mm256_and_si256(work_a, tffe0);
+ filter2 = _mm256_and_si256(filter2, t1f);
+ filter2 = _mm256_or_si256(filter2, work_a);
+
+ // filt >> 1
+ // equivalent to shifting 0x1f left by bitdepth - 8
+ // and setting new bits to 1
+ filt = _mm256_adds_epi16(filter1, one);
+ work_a = _mm256_cmpgt_epi16(zero, filt);
+ filt = _mm256_srli_epi16(filt, 1);
+ work_a = _mm256_and_si256(work_a, tff80);
+ filt = _mm256_and_si256(filt, t7f);
+ filt = _mm256_or_si256(filt, work_a);
+
+ filt = _mm256_andnot_si256(hev, filt);  // the p1 / q1 adjustment applies only where hev is not set
+ // Apply the adjustments, clamp to the pixel range, and add the t80 offset back.
+ filter1 = _mm256_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ q0 = _mm256_adds_epi16(filter1, t80);
+
+ filter1 = _mm256_subs_epi16(qs1, filt);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ q1 = _mm256_adds_epi16(filter1, t80);
+
+ filter2 = _mm256_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ p0 = _mm256_adds_epi16(filter2, t80);
+
+ filter2 = _mm256_adds_epi16(ps1, filt);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ p1 = _mm256_adds_epi16(filter2, t80);
+
+ _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
+ _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
+ _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
+ _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
+}
+// 8-tap horizontal-edge filter over 16 columns with two parameter sets (lanes 0-7: set 0, lanes 8-15: set 1). Reads rows s - 4 * p .. s + 3 * p and rewrites rows s - 3 * p .. s + 2 * p.
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);  // scratch rows for the smoothed outputs; accessed below with unaligned 256-bit stores/loads
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+
+ __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
+ __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
+ __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
+ __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
+ __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
+ __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
+ __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
+ __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p));
+
+ __m256i blimit, limit, thresh;
+ get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit, &limit, &thresh);
+
+ __m256i t80;  // mid-range offset: 0x80 << (bd - 8) for bd 8 / 10, with any other bd handled as 12
+ if (bd == 8) {
+ t80 = _mm256_set1_epi16(0x80);
+ } else if (bd == 10) {
+ t80 = _mm256_set1_epi16(0x200);
+ } else { // bd == 12
+ t80 = _mm256_set1_epi16(0x800);
+ }
+
+ __m256i ps1, ps0, qs0, qs1;
+ ps1 = _mm256_subs_epi16(p1, t80);
+ ps0 = _mm256_subs_epi16(p0, t80);
+ qs0 = _mm256_subs_epi16(q0, t80);
+ qs1 = _mm256_subs_epi16(q1, t80);
+
+ // filter_mask and hev_mask
+ __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
+ abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
+
+ abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
+ abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
+ __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
+ __m256i hev = _mm256_subs_epu16(flat, thresh);
+ const __m256i zero = _mm256_set1_epi16(0);
+ const __m256i ffff = _mm256_set1_epi16(0xFFFF);
+ hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);  // 0xFFFF where max(|p1 - p0|, |q1 - q0|) exceeds thresh
+
+ abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
+ __m256i mask =
+ _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+
+ const __m256i one = _mm256_set1_epi16(1);
+ mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
+ mask = _mm256_max_epi16(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ mask = _mm256_max_epi16(abs_q1q0, mask);
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)),
+ _mm256_abs_epi16(_mm256_sub_epi16(q2, q1)));
+ mask = _mm256_max_epi16(work, mask);
+ work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)),
+ _mm256_abs_epi16(_mm256_sub_epi16(q3, q2)));
+ mask = _mm256_max_epi16(work, mask);
+ mask = _mm256_subs_epu16(mask, limit);
+ mask = _mm256_cmpeq_epi16(mask, zero);  // 0xFFFF where no difference exceeded its limit
+
+ // flat_mask4
+ flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)),
+ _mm256_abs_epi16(_mm256_sub_epi16(q2, q0)));
+ work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)),
+ _mm256_abs_epi16(_mm256_sub_epi16(q3, q0)));
+ flat = _mm256_max_epi16(work, flat);
+ flat = _mm256_max_epi16(abs_p1p0, flat);
+ flat = _mm256_max_epi16(abs_q1q0, flat);
+ // Flatness threshold is 1, scaled to the bit depth (<< 2 for 10-bit, << 4 otherwise).
+ if (bd == 8)
+ flat = _mm256_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4));
+
+ flat = _mm256_cmpeq_epi16(flat, zero);
+ flat = _mm256_and_si256(flat, mask); // flat & mask
+
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+ __m256i workp_a, workp_b, workp_shft;
+ workp_a =
+ _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1));
+ const __m256i four = _mm256_set1_epi16(4);
+ workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0);
+ workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft);  // (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+
+ workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft);  // (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+
+ workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft);  // (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+
+ workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft);  // (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
+
+ workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft);  // (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
+
+ workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
+ _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft);  // (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3
+
+ // lp filter
+ const __m256i pmax = _mm256_subs_epi16(
+ _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
+ const __m256i pmin = _mm256_subs_epi16(zero, t80);
+
+ __m256i filt, filter1, filter2, work_a;
+ filt = _mm256_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm256_and_si256(filt, hev);
+ work_a = _mm256_subs_epi16(qs0, ps0);
+ filt = _mm256_adds_epi16(filt, work_a);
+ filt = _mm256_adds_epi16(filt, work_a);
+ filt = _mm256_adds_epi16(filt, work_a);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm256_and_si256(filt, mask);
+
+ const __m256i t4 = _mm256_set1_epi16(4);
+ const __m256i t3 = _mm256_set1_epi16(3);
+
+ filter1 = _mm256_adds_epi16(filt, t4);
+ filter2 = _mm256_adds_epi16(filt, t3);
+
+ // Filter1 >> 3
+ pixel_clamp(&pmin, &pmax, &filter1);
+ filter1 = _mm256_srai_epi16(filter1, 3);
+
+ // Filter2 >> 3
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter2 = _mm256_srai_epi16(filter2, 3);
+
+ // filt >> 1
+ filt = _mm256_adds_epi16(filter1, one);
+ filt = _mm256_srai_epi16(filt, 1);
+ // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+ filt = _mm256_andnot_si256(hev, filt);
+ // For each output row: 4-tap result (or the original row for p2 / q2) where flat is 0, smoothed scratch row where flat is set.
+ work_a = _mm256_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &work_a);
+ work_a = _mm256_adds_epi16(work_a, t80);
+ q0 = _mm256_loadu_si256((__m256i *)flat_oq0);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ q0 = _mm256_and_si256(flat, q0);
+ q0 = _mm256_or_si256(work_a, q0);
+
+ work_a = _mm256_subs_epi16(qs1, filt);
+ pixel_clamp(&pmin, &pmax, &work_a);
+ work_a = _mm256_adds_epi16(work_a, t80);
+ q1 = _mm256_loadu_si256((__m256i *)flat_oq1);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ q1 = _mm256_and_si256(flat, q1);
+ q1 = _mm256_or_si256(work_a, q1);
+
+ work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p));  // original q2 row, loaded from memory again
+ q2 = _mm256_loadu_si256((__m256i *)flat_oq2);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ q2 = _mm256_and_si256(flat, q2);
+ q2 = _mm256_or_si256(work_a, q2);
+
+ work_a = _mm256_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &work_a);
+ work_a = _mm256_adds_epi16(work_a, t80);
+ p0 = _mm256_loadu_si256((__m256i *)flat_op0);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ p0 = _mm256_and_si256(flat, p0);
+ p0 = _mm256_or_si256(work_a, p0);
+
+ work_a = _mm256_adds_epi16(ps1, filt);
+ pixel_clamp(&pmin, &pmax, &work_a);
+ work_a = _mm256_adds_epi16(work_a, t80);
+ p1 = _mm256_loadu_si256((__m256i *)flat_op1);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ p1 = _mm256_and_si256(flat, p1);
+ p1 = _mm256_or_si256(work_a, p1);
+
+ work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));  // original p2 row, loaded from memory again
+ p2 = _mm256_loadu_si256((__m256i *)flat_op2);
+ work_a = _mm256_andnot_si256(flat, work_a);
+ p2 = _mm256_and_si256(flat, p2);
+ p2 = _mm256_or_si256(work_a, p2);
+
+ _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
+ _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
+ _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
+ _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
+ _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
+ _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
+}
+// Vertical-edge 4-tap filter: fills t_dst from s - 4 and s - 4 + p * 8 via highbd_transpose8x16(), runs the horizontal 4-tap dual filter at row 4 of t_dst (stride 16), then writes back with highbd_transpose().
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // scratch: 8 rows of 16 samples
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);  // helper is defined elsewhere; NOTE(review): confirm its load/store alignment requirements against t_dst
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);  // two source/destination pairs, one per half of t_dst
+}
+// Vertical-edge 8-tap filter: fills t_dst from s - 4 and s - 4 + p * 8 via highbd_transpose8x16(), runs the horizontal 8-tap dual filter at row 4 of t_dst (stride 16), then writes back with highbd_transpose().
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // scratch: 8 rows of 16 samples
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);  // helper is defined elsewhere; NOTE(review): confirm its load/store alignment requirements against t_dst
+
+ // Loop filtering
+ aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);  // two source/destination pairs, one per half of t_dst
+}
+#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 76369871b..0a399edf2 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -12,135 +12,135 @@
#include <emmintrin.h> // SSE2
#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom_ports/emmintrin_compat.h"
+#include "aom_ports/mem.h"
-static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
- __m128i ubounded;
- __m128i lbounded;
- __m128i retval;
+// Clamps every signed 16-bit lane of *pixel to the range [*min, *max] and
+// writes the result back to *pixel.
+static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+ __m128i *pixel) {
+ __m128i clamped, mask;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- __m128i t80, max, min;
+ // Upper bound: lanes greater than *max are replaced by *max.
+ mask = _mm_cmpgt_epi16(*pixel, *max);
+ clamped = _mm_andnot_si128(mask, *pixel);
+ mask = _mm_and_si128(mask, *max);
+ clamped = _mm_or_si128(mask, clamped);
- if (bd == 8) {
- t80 = _mm_set1_epi16(0x80);
- max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
- } else if (bd == 10) {
- t80 = _mm_set1_epi16(0x200);
- max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
- } else { // bd == 12
- t80 = _mm_set1_epi16(0x800);
- max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
- }
+ // Lower bound: lanes that are not greater than *min are replaced by *min.
+ mask = _mm_cmpgt_epi16(clamped, *min);
+ clamped = _mm_and_si128(mask, clamped);
+ mask = _mm_andnot_si128(mask, *min);
+ *pixel = _mm_or_si128(clamped, mask);
+}
- min = _mm_subs_epi16(zero, t80);
+// Loads the 8-bit blimit (bl), limit (l) and thresh (t) vectors, widens the
+// low 8 bytes of each to 16-bit lanes and scales them to the bit depth by
+// shifting left by (bd - 8). Results are written to *blt, *lt and *thr.
+// NOTE(review): _mm_load_si128 is the aligned load, so bl, l and t are
+// expected to be 16-byte aligned - confirm against callers.
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+ const uint8_t *t, int bd, __m128i *blt,
+ __m128i *lt, __m128i *thr) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
- ubounded = _mm_cmpgt_epi16(value, max);
- lbounded = _mm_cmplt_epi16(value, min);
- retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
- ubounded = _mm_and_si128(ubounded, max);
- lbounded = _mm_and_si128(lbounded, min);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_or_si128(retval, lbounded);
- return retval;
-}
+ // Interleaving with zero zero-extends each byte to a 16-bit lane.
+ __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+ *blt = _mm_slli_epi16(x, shift);
-// TODO(debargha, peter): Break up large functions into smaller ones
-// in this file.
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- __m128i blimit, limit, thresh;
- __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
- __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
- __m128i ps1, qs1, ps0, qs0;
- __m128i abs_p0q0, abs_p1q1, ffff, work;
- __m128i filt, work_a, filter1, filter2;
- __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
- __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
- __m128i flat2_q0, flat2_p0;
- __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3;
- __m128i t4, t3, t80, t1;
- __m128i eight, four;
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+ *lt = _mm_slli_epi16(x, shift);
- if (bd == 8) {
- blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
- limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
- thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
- } else if (bd == 10) {
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
- } else { // bd == 12
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+ *thr = _mm_slli_epi16(x, shift);
+}
+
+// Loads `size` rows from each side of the edge at s using unaligned 128-bit
+// loads: p[i] is the row at s - (i + 1) * pitch and q[i] is the row at
+// s + i * pitch, so p[0]/q[0] are the rows adjacent to the edge.
+// p and q must each have room for `size` entries.
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+ __m128i *p, __m128i *q) {
+ int i;
+ for (i = 0; i < size; i++) {
+ p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
}
+}
+// Computes the "high edge variance" mask: each 16-bit lane of *hev is all
+// ones where max(|p1 - p0|, |q1 - q0|) exceeds the threshold *t, else zero.
+// Reads p[0], p[1], q[0] and q[1].
+// |a - b| is formed as the OR of the two saturating unsigned differences,
+// _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)).
+static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
+ const __m128i *t, __m128i *hev) {
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
+ // NOTE(review): signed max on unsigned differences; assumes the values fit
+ // in 15 bits - confirm for the supported bit depths.
+ __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ // Saturating subtract leaves zero exactly where h <= *t.
+ h = _mm_subs_epu16(h, *t);
- q4 = _mm_load_si128((__m128i *)(s + 4 * p));
- p4 = _mm_load_si128((__m128i *)(s - 5 * p));
- q3 = _mm_load_si128((__m128i *)(s + 3 * p));
- p3 = _mm_load_si128((__m128i *)(s - 4 * p));
- q2 = _mm_load_si128((__m128i *)(s + 2 * p));
- p2 = _mm_load_si128((__m128i *)(s - 3 * p));
- q1 = _mm_load_si128((__m128i *)(s + 1 * p));
- p1 = _mm_load_si128((__m128i *)(s - 2 * p));
- q0 = _mm_load_si128((__m128i *)(s + 0 * p));
- p0 = _mm_load_si128((__m128i *)(s - 1 * p));
-
- // highbd_filter_mask
- abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
- abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+ // NOTE(review): 0xFFFF is an int constant narrowed to the intrinsic's
+ // 16-bit argument; the intent is an all-ones lane.
+ const __m128i ffff = _mm_set1_epi16(0xFFFF);
+ const __m128i zero = _mm_setzero_si128();
+ // Invert the "h == 0" comparison to get "h > threshold".
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+}
- ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+// Computes the loop-filter mask. Each 16-bit lane of *mask is all ones when
+//   |p0 - q0| * 2 + |p1 - q1| / 2 <= *bl   and
+//   |p[i] - p[i-1]| <= *l and |q[i] - q[i-1]| <= *l for i = 1..3,
+// otherwise zero. Reads p[0..3] and q[0..3].
+static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
+ const __m128i *l, const __m128i *bl,
+ __m128i *mask) {
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
- abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16(0xFFFF);
+ // Lanes that exceed blimit are set to (limit + 1) so that they also fail
+ // the final "<= limit" test; all other lanes start at zero.
+ __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ // Fold in the largest difference between neighbouring rows on each side.
+ int i;
+ for (i = 1; i < 4; ++i) {
+ max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
+ _mm_subs_epu16(p[i - 1], p[i])));
+ max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
+ _mm_subs_epu16(q[i - 1], q[i])));
+ }
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
+}
- // highbd_hev_mask (in C code this is actually called from highbd_filter4)
- flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu16(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+// Computes a flatness mask over rows [start, end): each 16-bit lane of *flat
+// is all ones when |p[i] - p[0]| and |q[i] - q[0]| are all <= the threshold
+// for every i in the range, otherwise zero.
+// *th is given at 8-bit scale and is shifted left by 2 (bd == 10) or 4
+// (bd == 12) before the comparison.
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
+ const __m128i *q, int bd, int start,
+ int end, __m128i *flat) {
+ __m128i max = _mm_setzero_si128();
+ int i;
+ for (i = start; i < end; ++i) {
+ max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
+ _mm_subs_epu16(p[0], p[i])));
+ max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
+ _mm_subs_epu16(q[0], q[i])));
+ }
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
- _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
- mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
- _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
- mask = _mm_max_epi16(work, mask);
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
- _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
- mask = _mm_max_epi16(work, mask);
+ // Saturating subtract leaves zero exactly where max <= scaled threshold.
+ __m128i ft;
+ if (bd == 8)
+ ft = _mm_subs_epu16(max, *th);
+ else if (bd == 10)
+ ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
+ else // bd == 12
+ ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
- mask = _mm_subs_epu16(mask, limit);
- mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
- // lp filter
- // highbd_filter4
- t4 = _mm_set1_epi16(4);
- t3 = _mm_set1_epi16(3);
+// Flatness test for the 8-tap filter: *flat is all ones in lanes where rows
+// 1..3 on both sides differ from row 0 by at most the bd-scaled *th.
+// Note:
+// Reads p[1..3], p[0], and q[1..3], q[0]
+static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
+ const __m128i *q, __m128i *flat, int bd) {
+ // check the distance 1,2,3 against 0
+ flat_mask_internal(th, p, q, bd, 1, 4, flat);
+}
+
+// Flatness test for the wide filter: *flat is all ones in lanes where rows
+// 4..7 on both sides differ from row 0 by at most the bd-scaled *th.
+// Note:
+// Reads p[4..7], p[0], and q[4..7], q[0]
+static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
+ const __m128i *q, __m128i *flat, int bd) {
+ // check the distance 4,5,6,7 against 0
+ flat_mask_internal(th, p, q, bd, 4, 8, flat);
+}
+
+// Narrow (4-tap) loop filter. Reads p[0], p[1], q[0], q[1], applies the
+// filter in lanes selected by *mask, and writes the filtered rows to ps[0],
+// ps[1], qs[0] and qs[1]. *th is the high-edge-variance threshold passed to
+// highbd_hev_mask(); p and q themselves are not modified.
+static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
+ const __m128i *th, int bd, __m128i *ps,
+ __m128i *qs) {
+ // t80 is the mid-point offset used to move pixels into a signed range
+ // (0x80 for bd == 8, 0x800 for bd == 12; the bd == 10 assignment falls in
+ // the gap between the two hunks - presumably 0x200).
+ __m128i t80;
if (bd == 8)
t80 = _mm_set1_epi16(0x80);
else if (bd == 10)
@@ -148,340 +148,283 @@ void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
else // bd == 12
t80 = _mm_set1_epi16(0x800);
- t1 = _mm_set1_epi16(0x1);
+ __m128i ps0 = _mm_subs_epi16(p[0], t80);
+ __m128i ps1 = _mm_subs_epi16(p[1], t80);
+ __m128i qs0 = _mm_subs_epi16(q[0], t80);
+ __m128i qs1 = _mm_subs_epi16(q[1], t80);
- ps1 = _mm_subs_epi16(p1, t80);
- qs1 = _mm_subs_epi16(q1, t80);
- ps0 = _mm_subs_epi16(p0, t80);
- qs0 = _mm_subs_epi16(q0, t80);
+ // Signed clamp range: pmax = (1 << bd) - 1 - t80, pmin = -t80.
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i pmin = _mm_subs_epi16(zero, t80);
- filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
- hev);
- work_a = _mm_subs_epi16(qs0, ps0);
- filt = _mm_adds_epi16(filt, work_a);
- filt = _mm_adds_epi16(filt, work_a);
- filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
- filt = _mm_and_si128(filt, mask);
- filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
- filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+ __m128i filter = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
- // Filter1 >> 3
- filter1 = _mm_srai_epi16(filter1, 0x3);
- filter2 = _mm_srai_epi16(filter2, 0x3);
+ // The (ps1 - qs1) term is kept only in high-edge-variance lanes.
+ __m128i hev;
+ highbd_hev_mask(p, q, th, &hev);
+ filter = _mm_and_si128(filter, hev);
- qs0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
- ps0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(hev, filt);
- qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
- t80);
- ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
- t80);
+ // filter = clamp(filter + 3 * (qs0 - ps0)), then restricted to *mask.
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm_and_si128(filter, *mask);
- // end highbd_filter4
- // loopfilter done
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t4 = _mm_set1_epi16(4);
- // highbd_flat_mask4
- flat = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
- _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
- _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
- flat = _mm_max_epi16(work, flat);
- work = _mm_max_epi16(abs_p1p0, abs_q1q0);
- flat = _mm_max_epi16(work, flat);
+ // filter1 = clamp(filter + 4) >> 3, filter2 = clamp(filter + 3) >> 3.
+ __m128i filter1 = _mm_adds_epi16(filter, t4);
+ __m128i filter2 = _mm_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm_srai_epi16(filter1, 3);
+ filter2 = _mm_srai_epi16(filter2, 3);
- if (bd == 8)
- flat = _mm_subs_epu16(flat, one);
- else if (bd == 10)
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+ qs0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
- flat = _mm_cmpeq_epi16(flat, zero);
- // end flat_mask4
+ // Adding t80 back returns the rows to the unsigned pixel range.
+ qs[0] = _mm_adds_epi16(qs0, t80);
+ ps[0] = _mm_adds_epi16(ps0, t80);
- // flat & mask = flat && mask (as used in filter8)
- // (because, in both vars, each block of 16 either all 1s or all 0s)
- flat = _mm_and_si128(flat, mask);
+ // Outer rows get (filter1 + 1) >> 1, and only where hev is NOT set.
+ filter = _mm_adds_epi16(filter1, one);
+ filter = _mm_srai_epi16(filter, 1);
+ filter = _mm_andnot_si128(hev, filter);
- p5 = _mm_load_si128((__m128i *)(s - 6 * p));
- q5 = _mm_load_si128((__m128i *)(s + 5 * p));
- p6 = _mm_load_si128((__m128i *)(s - 7 * p));
- q6 = _mm_load_si128((__m128i *)(s + 6 * p));
- p7 = _mm_load_si128((__m128i *)(s - 8 * p));
- q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+ qs1 = _mm_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
- // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
- // but referred to as p0-p4 & q0-q4 in fn)
- flat2 = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
- _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
+ qs[1] = _mm_adds_epi16(qs1, t80);
+ ps[1] = _mm_adds_epi16(ps1, t80);
+}
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
- _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
- flat2 = _mm_max_epi16(work, flat2);
+// Selects how many pixels per row highbd_lpf_horz_edge_8_internal() writes
+// back: FOUR_PIXELS stores the low 64 bits of each register,
+// EIGHT_PIXELS stores the full 128 bits.
+typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput;
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
- _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
- flat2 = _mm_max_epi16(work, flat2);
+// Wide horizontal-edge loop filter for one 8-pixel-wide column group.
+//   s            - pointer to row q0 (first row below the edge)
+//   pitch        - row stride in uint16_t units
+//   blt, lt, thr - 8-bit blimit / limit / thresh vectors (see get_limit())
+//   bd           - bit depth
+//   pixel_output - FOUR_PIXELS stores the low 4 pixels of every row,
+//                  EIGHT_PIXELS stores all 8 with aligned stores
+// Reads 8 rows on each side of the edge and rewrites rows p6..p0 and q0..q6;
+// rows p7 and q7 are only read.
+static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
+ const uint8_t *blt,
+ const uint8_t *lt,
+ const uint8_t *thr, int bd,
+ PixelOutput pixel_output) {
+ __m128i blimit, limit, thresh;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
- work = _mm_max_epi16(
- _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
- _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
- flat2 = _mm_max_epi16(work, flat2);
+ // p[i] / q[i] hold the rows at s - (i + 1) * pitch and s + i * pitch.
+ __m128i p[8], q[8];
+ load_highbd_pixel(s, 8, pitch, p, q);
- if (bd == 8)
- flat2 = _mm_subs_epu16(flat2, one);
- else if (bd == 10)
- flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+ __m128i mask;
+ highbd_filter_mask(p, q, &limit, &blimit, &mask);
+
+ // Both flatness tests use a threshold of 1, scaled by bit depth inside
+ // the helpers.
+ __m128i flat, flat2;
+ const __m128i one = _mm_set1_epi16(1);
+ highbd_flat_mask4(&one, p, q, &flat, bd);
+ highbd_flat_mask5(&one, p, q, &flat2, bd);
+
+ // flat = flat && mask, flat2 = flat2 && flat && mask.
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
- flat2 = _mm_cmpeq_epi16(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- // end highbd_flat_mask5
+ // ps / qs: output of the narrow filter for rows 0 and 1.
+ __m128i ps[2], qs[2];
+ highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// flat and wide flat calculations
- eight = _mm_set1_epi16(8);
- four = _mm_set1_epi16(4);
-
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p =
- _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(
- four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- flat2_p0 =
- _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
- flat2_q0 =
- _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
- flat_p0 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
- flat_q0 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
-
- sum_p7 = _mm_add_epi16(p7, p7);
- sum_q7 = _mm_add_epi16(q7, q7);
- sum_p3 = _mm_add_epi16(p3, p3);
- sum_q3 = _mm_add_epi16(q3, q3);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
- flat2_p1 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
- flat2_q1 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
- flat_p1 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
- flat_q1 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7);
- sum_q7 = _mm_add_epi16(sum_q7, q7);
- sum_p3 = _mm_add_epi16(sum_p3, p3);
- sum_q3 = _mm_add_epi16(sum_q3, q3);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
- flat2_p2 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
- flat2_q2 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
- flat_p2 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
- flat_q2 = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7);
- sum_q7 = _mm_add_epi16(sum_q7, q7);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
- flat2_p3 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
- flat2_q3 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7);
- sum_q7 = _mm_add_epi16(sum_q7, q7);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
- flat2_p4 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
- flat2_q4 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7);
- sum_q7 = _mm_add_epi16(sum_q7, q7);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
- flat2_p5 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
- flat2_q5 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7);
- sum_q7 = _mm_add_epi16(sum_q7, q7);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
- flat2_p6 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
- flat2_q6 = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- // highbd_filter8
- p2 = _mm_andnot_si128(flat, p2);
+ // flat_p / flat_q: rows 0..2 of the 8-tap result, (sum + 4) >> 3.
+ // flat2_p / flat2_q: rows 0..6 of the wide result, (sum + 8) >> 4.
+ // Running sums are updated incrementally from one row to the next.
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[7], flat2_q[7];
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i sum_p =
+ _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3]));
+ __m128i sum_q =
+ _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3]));
+
+ __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
+ sum_q = _mm_add_epi16(sum_q, sum_lq);
+ // From here sum_p holds 8 + (p0..p6) + (q0..q6) and sum_lp holds
+ // 4 + (p0..p2) + (q0..q2), i.e. the rounding terms are folded in.
+ sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat2_p[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4);
+ flat2_q[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4);
+ flat_p[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
+ flat_q[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
+
+ __m128i sum_p7 = _mm_add_epi16(p[7], p[7]);
+ __m128i sum_q7 = _mm_add_epi16(q[7], q[7]);
+ __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
+ __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+ sum_q = _mm_sub_epi16(sum_p, p[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[6]);
+ flat2_p[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4);
+ flat2_q[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4);
+
+ sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+ flat_p[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
+ flat_q[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p[7]);
+ sum_q7 = _mm_add_epi16(sum_q7, q[7]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
+
+ sum_p = _mm_sub_epi16(sum_p, q[5]);
+ sum_q = _mm_sub_epi16(sum_q, p[5]);
+ flat2_p[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4);
+ flat2_q[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ flat_p[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
+ flat_q[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
+
+ // Rows 3..6 of the wide filter: each step adds one more copy of the
+ // outermost row (p7 / q7) and drops one row from the opposite side.
+ int i;
+ for (i = 3; i < 7; ++i) {
+ sum_p7 = _mm_add_epi16(sum_p7, p[7]);
+ sum_q7 = _mm_add_epi16(sum_q7, q[7]);
+ sum_p = _mm_sub_epi16(sum_p, q[7 - i]);
+ sum_q = _mm_sub_epi16(sum_q, p[7 - i]);
+ flat2_p[i] =
+ _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4);
+ flat2_q[i] =
+ _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4);
+ }
+ }
+
+ // highbd_filter8
+ p[2] = _mm_andnot_si128(flat, p[2]);
// p2 remains unchanged if !(flat && mask)
- flat_p2 = _mm_and_si128(flat, flat_p2);
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
// when (flat && mask)
- p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
-
- ps1 = _mm_andnot_si128(flat, ps1);
- // p1 takes the value assigned to in in filter4 if !(flat && mask)
- flat_p1 = _mm_and_si128(flat, flat_p1);
- // when (flat && mask)
- p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
- qs1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
-
- ps0 = _mm_andnot_si128(flat, ps0);
- // p0 takes the value assigned to in in filter4 if !(flat && mask)
- flat_p0 = _mm_and_si128(flat, flat_p0);
- // when (flat && mask)
- p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
- qs0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
- // end highbd_filter8
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ // Rows 1 and 0 take the 8-tap result where flat is set and the narrow
+ // filter output (ps / qs) everywhere else.
+ int i;
+ for (i = 1; i >= 0; i--) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
// highbd_filter16
- p6 = _mm_andnot_si128(flat2, p6);
- // p6 remains unchanged if !(flat2 && flat && mask)
- flat2_p6 = _mm_and_si128(flat2, flat2_p6);
- // get values for when (flat2 && flat && mask)
- p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
- q6 = _mm_andnot_si128(flat2, q6);
- // q6 remains unchanged if !(flat2 && flat && mask)
- flat2_q6 = _mm_and_si128(flat2, flat2_q6);
- // get values for when (flat2 && flat && mask)
- q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
- _mm_store_si128((__m128i *)(s - 7 * p), p6);
- _mm_store_si128((__m128i *)(s + 6 * p), q6);
-
- p5 = _mm_andnot_si128(flat2, p5);
- // p5 remains unchanged if !(flat2 && flat && mask)
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- // get values for when (flat2 && flat && mask)
- p5 = _mm_or_si128(p5, flat2_p5);
- // full list of p5 values
- q5 = _mm_andnot_si128(flat2, q5);
- // q5 remains unchanged if !(flat2 && flat && mask)
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- // get values for when (flat2 && flat && mask)
- q5 = _mm_or_si128(q5, flat2_q5);
- // full list of q5 values
- _mm_store_si128((__m128i *)(s - 6 * p), p5);
- _mm_store_si128((__m128i *)(s + 5 * p), q5);
-
- p4 = _mm_andnot_si128(flat2, p4);
- // p4 remains unchanged if !(flat2 && flat && mask)
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- // get values for when (flat2 && flat && mask)
- p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
- q4 = _mm_andnot_si128(flat2, q4);
- // q4 remains unchanged if !(flat2 && flat && mask)
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- // get values for when (flat2 && flat && mask)
- q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
- _mm_store_si128((__m128i *)(s - 5 * p), p4);
- _mm_store_si128((__m128i *)(s + 4 * p), q4);
-
- p3 = _mm_andnot_si128(flat2, p3);
- // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- // get values for when (flat2 && flat && mask)
- p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
- q3 = _mm_andnot_si128(flat2, q3);
- // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- // get values for when (flat2 && flat && mask)
- q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
- _mm_store_si128((__m128i *)(s - 4 * p), p3);
- _mm_store_si128((__m128i *)(s + 3 * p), q3);
-
- p2 = _mm_andnot_si128(flat2, p2);
- // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- // get values for when (flat2 && flat && mask)
- p2 = _mm_or_si128(p2, flat2_p2);
- // full list of p2 values
- q2 = _mm_andnot_si128(flat2, q2);
- // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- // get values for when (flat2 && flat && mask)
- q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
- _mm_store_si128((__m128i *)(s - 3 * p), p2);
- _mm_store_si128((__m128i *)(s + 2 * p), q2);
-
- p1 = _mm_andnot_si128(flat2, p1);
- // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- // get values for when (flat2 && flat && mask)
- p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
- q1 = _mm_andnot_si128(flat2, q1);
- // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- // get values for when (flat2 && flat && mask)
- q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
- _mm_store_si128((__m128i *)(s - 2 * p), p1);
- _mm_store_si128((__m128i *)(s + 1 * p), q1);
-
- p0 = _mm_andnot_si128(flat2, p0);
- // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- // get values for when (flat2 && flat && mask)
- p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
- q0 = _mm_andnot_si128(flat2, q0);
- // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- // get values for when (flat2 && flat && mask)
- q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
- _mm_store_si128((__m128i *)(s - 1 * p), p0);
- _mm_store_si128((__m128i *)(s - 0 * p), q0);
+
+ // Blend in the wide-filter result where flat2 is set and store rows
+ // p6..p0 / q6..q0. The two branches differ only in the store: 64-bit
+ // (4 pixels) versus aligned 128-bit (8 pixels).
+ if (pixel_output == FOUR_PIXELS) {
+ for (i = 6; i >= 0; i--) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
+ }
+ } else { // EIGHT_PIXELS
+ for (i = 6; i >= 0; i--) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+ }
+ }
+}
+
+// Note:
+// highbd_lpf_horz_edge_8_8p() outputs 8 pixels per register
+// highbd_lpf_horz_edge_8_4p() outputs 4 pixels per register
+// Only compiled when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are set.
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+// Wide horizontal-edge filter that stores only the low 4 pixels of each row
+// (FOUR_PIXELS mode of highbd_lpf_horz_edge_8_internal()).
+static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch,
+ const uint8_t *blt,
+ const uint8_t *lt,
+ const uint8_t *thr, int bd) {
+ highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS);
+}
+#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+
+// Wide horizontal-edge filter that stores all 8 pixels of each row
+// (EIGHT_PIXELS mode of highbd_lpf_horz_edge_8_internal(), which uses
+// aligned 128-bit stores).
+static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch,
+ const uint8_t *blt,
+ const uint8_t *lt,
+ const uint8_t *thr, int bd) {
+ highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS);
+}
+
+// Public entry point for the wide horizontal-edge filter.
+// With CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 it writes 4 pixels per row
+// (highbd_lpf_horz_edge_8_4p); otherwise it writes 8 pixels per row
+// (highbd_lpf_horz_edge_8_8p). s points at row q0, p is the pitch in
+// uint16_t units, bd is the bit depth.
+void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+ highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
+#else
+ highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
+#endif
}
+// Public entry point for the 16-wide variant of the horizontal-edge filter.
+// Without CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 it filters two 8-pixel
+// groups, at s and s + 8.
+// NOTE(review): with both options set it makes a single 4-pixel call, so
+// only columns 0-3 are written - confirm this is the intended width.
void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
const uint8_t *_blimit,
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
- aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
- aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+ highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
+#else
+ highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
+ highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd);
+#endif
+}
+
+// Stores the six filtered rows p2, p1, p0, q0, q1, q2 around the edge at s
+// (rows s - 3 * p through s + 2 * p; p is the pitch in uint16_t units).
+// With CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 only the low 64 bits
+// (4 pixels) of each register are written; otherwise the full 128 bits are
+// written with aligned stores, so each row address must be 16-byte aligned.
+static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
+ const __m128i *p0, const __m128i *q0,
+ const __m128i *q1, const __m128i *q2,
+ int p, uint16_t *s) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+ _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
+#else
+ _mm_store_si128((__m128i *)(s - 3 * p), *p2);
+ _mm_store_si128((__m128i *)(s - 2 * p), *p1);
+ _mm_store_si128((__m128i *)(s - 1 * p), *p0);
+ _mm_store_si128((__m128i *)(s + 0 * p), *q0);
+ _mm_store_si128((__m128i *)(s + 1 * p), *q1);
+ _mm_store_si128((__m128i *)(s + 2 * p), *q2);
+#endif
}
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
@@ -497,14 +440,14 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const __m128i zero = _mm_set1_epi16(0);
__m128i blimit, limit, thresh;
__m128i mask, hev, flat;
- __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
- __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
- __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
- __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
- __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
- __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
- __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -635,41 +578,48 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
_mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
// lp filter
- filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+ const __m128i pmin = _mm_subs_epi16(zero, t80);
+
+ filt = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filt);
+
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
// (aom_filter + 3 * (qs0 - ps0)) & mask
- filt = signed_char_clamp_bd_sse2(filt, bd);
+ pixel_clamp(&pmin, &pmax, &filt);
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi16(filt, t4);
filter2 = _mm_adds_epi16(filt, t3);
// Filter1 >> 3
- filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+ pixel_clamp(&pmin, &pmax, &filter1);
filter1 = _mm_srai_epi16(filter1, 3);
// Filter2 >> 3
- filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+ pixel_clamp(&pmin, &pmax, &filter2);
filter2 = _mm_srai_epi16(filter2, 3);
// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
- // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
filt = _mm_andnot_si128(hev, filt);
- work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+ work_a = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &work_a);
work_a = _mm_adds_epi16(work_a, t80);
q0 = _mm_load_si128((__m128i *)flat_oq0);
work_a = _mm_andnot_si128(flat, work_a);
q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0);
- work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+ work_a = _mm_subs_epi16(qs1, filt);
+ pixel_clamp(&pmin, &pmax, &work_a);
work_a = _mm_adds_epi16(work_a, t80);
q1 = _mm_load_si128((__m128i *)flat_oq1);
work_a = _mm_andnot_si128(flat, work_a);
@@ -682,14 +632,16 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2);
- work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+ work_a = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &work_a);
work_a = _mm_adds_epi16(work_a, t80);
p0 = _mm_load_si128((__m128i *)flat_op0);
work_a = _mm_andnot_si128(flat, work_a);
p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0);
- work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+ work_a = _mm_adds_epi16(ps1, filt);
+ pixel_clamp(&pmin, &pmax, &work_a);
work_a = _mm_adds_epi16(work_a, t80);
p1 = _mm_load_si128((__m128i *)flat_op1);
work_a = _mm_andnot_si128(flat, work_a);
@@ -702,12 +654,7 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_store_si128((__m128i *)(s - 3 * p), p2);
- _mm_store_si128((__m128i *)(s - 2 * p), p1);
- _mm_store_si128((__m128i *)(s - 1 * p), p0);
- _mm_store_si128((__m128i *)(s + 0 * p), q0);
- _mm_store_si128((__m128i *)(s + 1 * p), q1);
- _mm_store_si128((__m128i *)(s + 2 * p), q2);
+ store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s);
}
void aom_highbd_lpf_horizontal_8_dual_sse2(
@@ -725,14 +672,18 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const __m128i zero = _mm_set1_epi16(0);
__m128i blimit, limit, thresh;
__m128i mask, hev, flat;
+#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+#endif
__m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
__m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
__m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
__m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
__m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
__m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+#endif
const __m128i abs_p1p0 =
_mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
const __m128i abs_q1q0 =
@@ -743,7 +694,7 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
_mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
__m128i abs_p1q1 =
_mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
- __m128i work;
+
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
__m128i t80;
@@ -814,9 +765,9 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epi16(
+
+#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
+ __m128i work = _mm_max_epi16(
_mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
mask = _mm_max_epi16(work, mask);
@@ -824,22 +775,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
_mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
+#endif
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
// filter4
- filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+ const __m128i pmin = _mm_subs_epi16(zero, t80);
+
+ filt = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filt);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
- filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ filt = _mm_adds_epi16(filt, work_a);
+ pixel_clamp(&pmin, &pmax, &filt);
// (aom_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
- filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
- filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+ filter1 = _mm_adds_epi16(filt, t4);
+ pixel_clamp(&pmin, &pmax, &filter1);
+
+ filter2 = _mm_adds_epi16(filt, t3);
+ pixel_clamp(&pmin, &pmax, &filter2);
// Filter1 >> 3
work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
@@ -865,19 +826,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
filt = _mm_andnot_si128(hev, filt);
- q0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
- q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
- t80);
- p0 = _mm_adds_epi16(
- signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
- p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
- t80);
-
+ q0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &q0);
+ q0 = _mm_adds_epi16(q0, t80);
+
+ q1 = _mm_subs_epi16(qs1, filt);
+ pixel_clamp(&pmin, &pmax, &q1);
+ q1 = _mm_adds_epi16(q1, t80);
+
+ p0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &p0);
+ p0 = _mm_adds_epi16(p0, t80);
+
+ p1 = _mm_adds_epi16(ps1, filt);
+ pixel_clamp(&pmin, &pmax, &p1);
+ p1 = _mm_adds_epi16(p1, t80);
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+ _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+#else
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
_mm_storeu_si128((__m128i *)(s + 0 * p), q0);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+#endif
}
void aom_highbd_lpf_horizontal_4_dual_sse2(
@@ -888,118 +862,6 @@ void aom_highbd_lpf_horizontal_4_dual_sse2(
aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}
-static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
- int out_p, int num_8x8_to_transpose) {
- int idx8x8 = 0;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
- do {
- uint16_t *in = src[idx8x8];
- uint16_t *out = dst[idx8x8];
-
- p0 =
- _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
- p1 =
- _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
- p2 =
- _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
- p3 =
- _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
- p4 =
- _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
- p5 =
- _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
- p6 =
- _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
- p7 =
- _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
- // 00 10 01 11 02 12 03 13
- x0 = _mm_unpacklo_epi16(p0, p1);
- // 20 30 21 31 22 32 23 33
- x1 = _mm_unpacklo_epi16(p2, p3);
- // 40 50 41 51 42 52 43 53
- x2 = _mm_unpacklo_epi16(p4, p5);
- // 60 70 61 71 62 72 63 73
- x3 = _mm_unpacklo_epi16(p6, p7);
- // 00 10 20 30 01 11 21 31
- x4 = _mm_unpacklo_epi32(x0, x1);
- // 40 50 60 70 41 51 61 71
- x5 = _mm_unpacklo_epi32(x2, x3);
- // 00 10 20 30 40 50 60 70
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 01 11 21 31 41 51 61 71
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
- // 00 10 20 30 40 50 60 70
- _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
- // 01 11 21 31 41 51 61 71
-
- // 02 12 22 32 03 13 23 33
- x4 = _mm_unpackhi_epi32(x0, x1);
- // 42 52 62 72 43 53 63 73
- x5 = _mm_unpackhi_epi32(x2, x3);
- // 02 12 22 32 42 52 62 72
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
- // 02 12 22 32 42 52 62 72
- _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
- // 03 13 23 33 43 53 63 73
-
- // 04 14 05 15 06 16 07 17
- x0 = _mm_unpackhi_epi16(p0, p1);
- // 24 34 25 35 26 36 27 37
- x1 = _mm_unpackhi_epi16(p2, p3);
- // 44 54 45 55 46 56 47 57
- x2 = _mm_unpackhi_epi16(p4, p5);
- // 64 74 65 75 66 76 67 77
- x3 = _mm_unpackhi_epi16(p6, p7);
- // 04 14 24 34 05 15 25 35
- x4 = _mm_unpacklo_epi32(x0, x1);
- // 44 54 64 74 45 55 65 75
- x5 = _mm_unpacklo_epi32(x2, x3);
- // 04 14 24 34 44 54 64 74
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 05 15 25 35 45 55 65 75
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
- // 04 14 24 34 44 54 64 74
- _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
- // 05 15 25 35 45 55 65 75
-
- // 06 16 26 36 07 17 27 37
- x4 = _mm_unpackhi_epi32(x0, x1);
- // 46 56 66 76 47 57 67 77
- x5 = _mm_unpackhi_epi32(x2, x3);
- // 06 16 26 36 46 56 66 76
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
- // 06 16 26 36 46 56 66 76
- _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
- // 07 17 27 37 47 57 67 77
- } while (++idx8x8 < num_8x8_to_transpose);
-}
-
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
- uint16_t *out, int out_p) {
- uint16_t *src0[1];
- uint16_t *src1[1];
- uint16_t *dest0[1];
- uint16_t *dest1[1];
- src0[0] = in0;
- src1[0] = in1;
- dest0[0] = out;
- dest1[0] = out + 8;
- highbd_transpose(src0, in_p, dest0, out_p, 1);
- highbd_transpose(src1, in_p, dest1, out_p, 1);
-}
-
void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
@@ -1130,10 +992,12 @@ void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
- // Loop filtering
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+ highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd);
+#else
aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
thresh, bd);
-
+#endif
// Transpose back
highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
index 9c3bbdd69..855bc6558 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -293,4 +293,6 @@ HIGH_SADNXN4D 4, 16
HIGH_SADNXN4D 16, 4
HIGH_SADNXN4D 8, 32
HIGH_SADNXN4D 32, 8
+HIGH_SADNXN4D 16, 64
+HIGH_SADNXN4D 64, 16
%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
index 248b98ef5..760e68aab 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -158,7 +158,10 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-
+%if CONFIG_EXT_PARTITION_TYPES
+HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
+HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
+%endif
; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
@@ -302,6 +305,8 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
+HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
%endif
; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
index 7bc8a0df3..befd81269 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -177,177 +177,94 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
_mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
}
-static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 3;
- src += src_stride << 3;
- pred += pred_stride << 3;
- subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += 8;
- src += 8;
- pred += 8;
- subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 3;
- src += src_stride << 3;
- pred += pred_stride << 3;
- subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 4;
- src += src_stride << 4;
- pred += pred_stride << 4;
- subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += 16;
- src += 16;
- pred += 16;
- subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 4;
- src += src_stride << 4;
- pred += pred_stride << 4;
- subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 5;
- src += src_stride << 5;
- pred += pred_stride << 5;
- subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += 32;
- src += 32;
- pred += 32;
- subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 5;
- src += src_stride << 5;
- pred += pred_stride << 5;
- subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 6;
- src += src_stride << 6;
- pred += pred_stride << 6;
- subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += 64;
- src += 64;
- pred += 64;
- subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
-
-static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
- diff += diff_stride << 6;
- src += src_stride << 6;
- pred += pred_stride << 6;
- subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
+#define STACK_V(h, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
+ pred + pred_stride * h, pred_stride); \
+ } while (0)
+
+#define STACK_H(w, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
+ } while (0)
+
+#define SUBTRACT_FUN(size) \
+ static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
+ const uint16_t *src, ptrdiff_t src_stride, \
+ const uint16_t *pred, ptrdiff_t pred_stride)
+
+SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
+SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
+SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
+SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
+SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
+SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
+SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
+SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
+#if CONFIG_EXT_PARTITION
+SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
+SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
+SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
+#endif
+SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
+SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
+SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
+SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
+SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
+SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
+#if CONFIG_EXT_PARTITION
+SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); }
+SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); }
+#endif
static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
- SubtractWxHFuncType ret_func_ptr = NULL;
if (rows == 4) {
- if (cols == 4) {
- ret_func_ptr = subtract_4x4;
- } else if (cols == 8) {
- ret_func_ptr = subtract_8x4;
- }
- } else if (rows == 8) {
- if (cols == 4) {
- ret_func_ptr = subtract_4x8;
- } else if (cols == 8) {
- ret_func_ptr = subtract_8x8;
- } else if (cols == 16) {
- ret_func_ptr = subtract_16x8;
- }
- } else if (rows == 16) {
- if (cols == 8) {
- ret_func_ptr = subtract_8x16;
- } else if (cols == 16) {
- ret_func_ptr = subtract_16x16;
- } else if (cols == 32) {
- ret_func_ptr = subtract_32x16;
- }
- } else if (rows == 32) {
- if (cols == 16) {
- ret_func_ptr = subtract_16x32;
- } else if (cols == 32) {
- ret_func_ptr = subtract_32x32;
- } else if (cols == 64) {
- ret_func_ptr = subtract_64x32;
- }
- } else if (rows == 64) {
- if (cols == 32) {
- ret_func_ptr = subtract_32x64;
- } else if (cols == 64) {
- ret_func_ptr = subtract_64x64;
- } else if (cols == 128) {
- ret_func_ptr = subtract_128x64;
- }
- } else if (rows == 128) {
- if (cols == 64) {
- ret_func_ptr = subtract_64x128;
- } else if (cols == 128) {
- ret_func_ptr = subtract_128x128;
- }
+ if (cols == 4) return subtract_4x4;
+ if (cols == 8) return subtract_8x4;
+ if (cols == 16) return subtract_16x4;
+ }
+ if (rows == 8) {
+ if (cols == 4) return subtract_4x8;
+ if (cols == 8) return subtract_8x8;
+ if (cols == 16) return subtract_16x8;
+ if (cols == 32) return subtract_32x8;
+ }
+ if (rows == 16) {
+ if (cols == 4) return subtract_4x16;
+ if (cols == 8) return subtract_8x16;
+ if (cols == 16) return subtract_16x16;
+ if (cols == 32) return subtract_32x16;
+ if (cols == 64) return subtract_64x16;
+ }
+ if (rows == 32) {
+ if (cols == 8) return subtract_8x32;
+ if (cols == 16) return subtract_16x32;
+ if (cols == 32) return subtract_32x32;
+ if (cols == 64) return subtract_64x32;
+#if CONFIG_EXT_PARTITION
+ if (cols == 128) return subtract_128x32;
+#endif // CONFIG_EXT_PARTITION
+ }
+ if (rows == 64) {
+ if (cols == 16) return subtract_16x64;
+ if (cols == 32) return subtract_32x64;
+ if (cols == 64) return subtract_64x64;
+#if CONFIG_EXT_PARTITION
+ if (cols == 128) return subtract_128x64;
+#endif // CONFIG_EXT_PARTITION
}
- if (!ret_func_ptr) {
- assert(0);
+#if CONFIG_EXT_PARTITION
+ if (rows == 128) {
+ if (cols == 32) return subtract_32x128;
+ if (cols == 64) return subtract_64x128;
+ if (cols == 128) return subtract_128x128;
}
- return ret_func_ptr;
+#endif // CONFIG_EXT_PARTITION
+ assert(0);
+ return NULL;
}
void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 93923ffb0..62acf3ed3 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -189,6 +189,8 @@ VAR_FN(8, 8, 8, 6);
VAR_FN(16, 4, 16, 6);
VAR_FN(8, 32, 8, 8);
VAR_FN(32, 8, 16, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
#endif
#undef VAR_FN
@@ -411,7 +413,9 @@ DECLS(sse2);
FN(8, 4, 8, 3, 2, opt, (int64_t)); \
FN(16, 4, 16, 4, 2, opt, (int64_t)); \
FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t))
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
#else
#define FNS(opt) \
FN(64, 64, 16, 6, 6, opt, (int64_t)); \
@@ -588,7 +592,9 @@ DECLS(sse2);
FN(8, 4, 8, 3, 2, opt, (int64_t)); \
FN(16, 4, 16, 4, 2, opt, (int64_t)); \
FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t));
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t));
#else
#define FNS(opt) \
FN(64, 64, 16, 6, 6, opt, (int64_t)); \
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
new file mode 100644
index 000000000..6b8922b8c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE __m256i dc_sum_32(const uint8_t *ref) {
+ const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y = _mm256_sad_epu8(x, zero);
+ __m256i u = _mm256_permute2x128_si256(y, y, 1);
+ y = _mm256_add_epi64(u, y);
+ u = _mm256_unpackhi_epi64(y, y);
+ return _mm256_add_epi16(y, u);
+}
+
+static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ dst += stride;
+ }
+}
+
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum_left = _mm256_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm256_srai_epi16(sum_left, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum_left, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+// There are 32 rows together. This function does lines
+// 0,1,2,3, and 16,17,18,19. The next call would do
+// 4,5,6,7, and 20,21,22,23. So calling it 4 times
+// would finish all 32 rows.
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m256i t[4];
+ __m256i m = _mm256_setzero_si256();
+ const __m256i inc = _mm256_set1_epi8(4);
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ t[i] = _mm256_shuffle_epi8(*row, m);
+ __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
+ __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
+ _mm256_storeu_si256((__m256i *)dst, r0);
+ _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
+ dst += stride;
+ m = _mm256_add_epi8(m, inc);
+ }
+}
+
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
+
+ __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
+
+ __m256i v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ u = _mm256_unpackhi_epi8(left_col, left_col);
+
+ v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// Rectangle
+
+// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
+// Use a header file, intrapred_common_x86.h
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+ __m128i x = _mm_load_si128((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_sad_epu8(x, zero);
+ const __m128i high = _mm_unpackhi_epi64(x, x);
+ return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i top_sum = dc_sum_32_sse2(above);
+ __m128i left_sum = dc_sum_16_sse2(left);
+ left_sum = _mm_add_epi16(top_sum, left_sum);
+ uint32_t sum = _mm_cvtsi128_si32(left_sum);
+ sum += 24;
+ sum /= 48;
+
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+// DC_LEFT for a 32x16 block: only the left column is used. (sum + 8) >> 4 is
+// the rounded mean of 16 pixels.
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  const __m128i rounded =
+      _mm_add_epi16(dc_sum_16_sse2(left), _mm_set1_epi16(8));
+  const __m128i mean = _mm_srai_epi16(rounded, 4);
+  // An all-zero shuffle mask copies byte 0 into all 16 byte positions.
+  const __m128i dc128 = _mm_shuffle_epi8(mean, _mm_setzero_si128());
+  // Mirror the 128-bit value into both lanes of the 256-bit row.
+  const __m256i row =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(dc128), dc128, 1);
+  row_store_32xh(&row, 16, dst, stride);
+}
+
+// DC_128 for a 32x16 block: neither edge is consulted; the constant byte 0x80
+// is broadcast and passed to row_store_32xh() for 16 rows.
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((uint8_t)0x80);
+  (void)above;
+  (void)left;
+  row_store_32xh(&mid_level, 16, dst, stride);
+}
+
+// V_PRED for a 32x16 block: the 32 above pixels (unaligned load) form the row
+// that is passed to row_store_32xh() for 16 rows.
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
+  row_store_32xh(&top_row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// TM_PRED
+
+// Paeth selection on 16 pixels held as 16-bit lanes.
+// For every lane, base = top + left - topleft, and the result is whichever of
+// left, top, topleft is closest to base; ties prefer left, then top.
+// Returns 16 16-bit pixels in one row (__m256i).
+static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i base =
+ _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
+
+ // Absolute distance from base to each candidate.
+ __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
+ __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
+ __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
+
+ // mask1: left is NOT the closest (it is farther than top or than topleft).
+ __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
+ mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
+ // mask2: topleft is strictly closer than top.
+ __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
+
+ // Keep left only in lanes where it wins (pl is reused as the left term).
+ pl = _mm256_andnot_si256(mask1, *left);
+
+ // In the remaining lanes choose between topleft (mask2) and top (!mask2).
+ ptl = _mm256_and_si256(mask2, *topleft);
+ pt = _mm256_andnot_si256(mask2, *top);
+ pt = _mm256_or_si256(pt, ptl);
+ pt = _mm256_and_si256(mask1, pt);
+
+ return _mm256_or_si256(pt, pl);
+}
+
+// Paeth-predict one row of 16 pixels and narrow the result to bytes.
+// Return 16 8-bit pixels in one row (__m128i)
+static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i p0 = paeth_pred(left, top, topleft);
+ // 0xe moves the upper 128 bits of p0 into the low lane of p1, so the
+ // per-lane pack below sees pixels 0-7 and 8-15 side by side in the low lane.
+ const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i p = _mm256_packus_epi16(p0, p1);
+ // Only the low 128 bits carry the 16 packed pixels.
+ return _mm256_castsi256_si128(p);
+}
+
+// Load 16 above pixels (aligned load) and zero-extend them to 16 16-bit lanes.
+static INLINE __m256i get_top_vector(const uint8_t *above) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bytes = _mm_load_si128((const __m128i *)above);
+  const __m128i lo16 = _mm_unpacklo_epi8(bytes, zero);  // pixels 0-7
+  const __m128i hi16 = _mm_unpackhi_epi8(bytes, zero);  // pixels 8-15
+  const __m256i widened = _mm256_castsi128_si256(lo16);
+  return _mm256_inserti128_si256(widened, hi16, 1);
+}
+
+// Paeth prediction for a 16x8 block.
+// For each of the 8 rows, that row's left pixel is broadcast to 16-bit lanes
+// and combined with the 16 above pixels and the top-left pixel above[-1].
+// above is read with an aligned load and dst is written with aligned stores.
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
+  const __m256i l =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(left8), left8, 1);
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  // Shuffle control: in every 16-bit lane the low byte selects left[i] and the
+  // high byte (0x80) yields zero, i.e. left[i] zero-extended to 16 bits.
+  // The cast makes the int -> short narrowing of 0x8000 explicit.
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+
+  int i;
+  for (i = 0; i < 8; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+    _mm_store_si128((__m128i *)dst, row);
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);  // select the next left pixel
+  }
+}
+
+// Load 16 left pixels (aligned load) and mirror them into both 128-bit lanes.
+static INLINE __m256i get_left_vector(const uint8_t *left) {
+  const __m128i col = _mm_load_si128((const __m128i *)left);
+  const __m256i lower = _mm256_castsi128_si256(col);
+  return _mm256_inserti128_si256(lower, col, 1);
+}
+
+// Paeth prediction for a 16x16 block.
+// For each of the 16 rows, that row's left pixel is broadcast to 16-bit lanes
+// and combined with the 16 above pixels and the top-left pixel above[-1].
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i l = get_left_vector(left);
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  // Shuffle control: low byte = index into the left column, high byte 0x80
+  // zeroes the upper half of each 16-bit lane. The cast makes the int -> short
+  // narrowing of 0x8000 explicit.
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+
+  int i;
+  for (i = 0; i < 16; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+    _mm_store_si128((__m128i *)dst, row);
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);  // select the next left pixel
+  }
+}
+
+// Paeth prediction for a 16x32 block.
+// The 32 left pixels are consumed in two groups of 16; within a group, each
+// row's left pixel is broadcast to 16-bit lanes and combined with the 16 above
+// pixels and the top-left pixel above[-1].
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+
+  int half;
+  for (half = 0; half < 2; ++half) {
+    const __m256i l = get_left_vector(left + 16 * half);
+    // Shuffle control restarts at index 0 for each group of 16 left pixels:
+    // low byte = index, high byte 0x80 zeroes the upper half of the lane.
+    // The cast makes the int -> short narrowing of 0x8000 explicit.
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+      _mm_store_si128((__m128i *)dst, row);
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left pixel
+    }
+  }
+}
+
+// Paeth-predict one row of 32 pixels from two 16-pixel top vectors and narrow
+// the result to bytes.
+// Return 32 8-bit pixels in one row (__m256i)
+static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
+ const __m256i *top1,
+ const __m256i *topleft) {
+ // Pixels 0-15: after the permute + per-lane pack they sit in x0's low lane.
+ __m256i p0 = paeth_pred(left, top0, topleft);
+ __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x0 = _mm256_packus_epi16(p0, p1);
+
+ // Pixels 16-31: same steps, result in x1's low lane.
+ p0 = paeth_pred(left, top1, topleft);
+ p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x1 = _mm256_packus_epi16(p0, p1);
+
+ // 0x20 concatenates the low lanes: x0.low -> bits 0-127, x1.low -> 128-255.
+ return _mm256_permute2x128_si256(x0, x1, 0x20);
+}
+
+// Paeth prediction for a 32x16 block.
+// The 32 above pixels are held as two 16-lane vectors; each of the 16 rows
+// broadcasts its left pixel and writes 32 predicted bytes with one unaligned
+// 256-bit store.
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i l = get_left_vector(left);
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  // Shuffle control: low byte = index into the left column, high byte 0x80
+  // zeroes the upper half of each 16-bit lane. The cast makes the int -> short
+  // narrowing of 0x8000 explicit.
+  __m256i rep = _mm256_set1_epi16((short)0x8000);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i;
+  for (i = 0; i < 16; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
+
+    _mm256_storeu_si256((__m256i *)dst, r);
+
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);  // select the next left pixel
+  }
+}
+
+// Paeth prediction for a 32x32 block.
+// The 32 left pixels are consumed in two groups of 16. Every row is produced
+// as two 16-pixel halves (above[0..15] and above[16..31]) and written with
+// two aligned 128-bit stores.
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int half;
+  for (half = 0; half < 2; ++half) {
+    const __m256i l = get_left_vector(left + 16 * half);
+    // Shuffle control restarts at index 0 for each group of 16 left pixels:
+    // low byte = index, high byte 0x80 zeroes the upper half of the lane.
+    // The cast makes the int -> short narrowing of 0x8000 explicit.
+    __m256i rep = _mm256_set1_epi16((short)0x8000);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left pixel
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm
index 02567db49..9aece27be 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm
@@ -623,149 +623,3 @@ cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
lea dstq, [dstq+strideq*4]
jnz .loop
REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
- pxor m1, m1
- movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
- punpcklbw m0, m1
- pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
- psrldq m0, 2
- psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
- movd m2, [leftq]
- punpcklbw m2, m1
- pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
- pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m4
- packuswb m3, m3
- movd [dstq ], m4
- movd [dstq+strideq], m3
- lea dstq, [dstq+strideq*2]
- pshuflw m4, m2, 0xaa
- pshuflw m3, m2, 0xff
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m4
- packuswb m3, m3
- movd [dstq ], m4
- movd [dstq+strideq], m3
- RET
-
-INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
- pxor m1, m1
- movd m2, [aboveq-1]
- movq m0, [aboveq]
- punpcklbw m2, m1
- punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
- pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -4
- punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
- psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
- movq m2, [leftq]
- punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
-.loop:
- pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
- pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
- punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
- punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m3
- movq [dstq ], m4
- movhps [dstq+strideq], m4
- lea dstq, [dstq+strideq*2]
- psrldq m2, 4
- inc lineq
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
- pxor m1, m1
- mova m2, [aboveq-16];
- mova m0, [aboveq] ; t1 t2 ... t16 [byte]
- punpckhbw m2, m1 ; [127:112] tl [word]
- punpckhbw m4, m0, m1
- punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word]
- DEFINE_ARGS dst, stride, line, left, stride8
- mov lineq, -8
- pshufhw m2, m2, 0xff
- mova m3, [leftq] ; l1 l2 ... l16 [byte]
- punpckhqdq m2, m2 ; tl repeated 8 times [word]
- psubw m0, m2
- psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
- punpckhbw m5, m3, m1
- punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word]
- lea stride8q, [strideq*8]
-.loop:
- pshuflw m6, m3, 0x0
- pshuflw m7, m5, 0x0
- punpcklqdq m6, m6 ; l1 repeated 8 times [word]
- punpcklqdq m7, m7 ; l8 repeated 8 times [word]
- paddw m1, m6, m0
- paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word]
- psrldq m5, 2
- packuswb m1, m6
- mova [dstq ], m1
- paddw m1, m7, m0
- paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word]
- psrldq m3, 2
- packuswb m1, m7
- mova [dstq+stride8q], m1
- inc lineq
- lea dstq, [dstq+strideq]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
- pxor m1, m1
- movd m2, [aboveq-1]
- mova m0, [aboveq]
- mova m4, [aboveq+16]
- punpcklbw m2, m1
- punpckhbw m3, m0, m1
- punpckhbw m5, m4, m1
- punpcklbw m0, m1
- punpcklbw m4, m1
- pshuflw m2, m2, 0x0
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -16
- punpcklqdq m2, m2
- add leftq, 32
- psubw m0, m2
- psubw m3, m2
- psubw m4, m2
- psubw m5, m2
-.loop:
- movd m2, [leftq+lineq*2]
- pxor m1, m1
- punpcklbw m2, m1
- pshuflw m7, m2, 0x55
- pshuflw m2, m2, 0x0
- punpcklqdq m2, m2
- punpcklqdq m7, m7
- paddw m6, m2, m3
- paddw m1, m2, m0
- packuswb m1, m6
- mova [dstq ], m1
- paddw m6, m2, m5
- paddw m1, m2, m4
- packuswb m1, m6
- mova [dstq+16 ], m1
- paddw m6, m7, m3
- paddw m1, m7, m0
- packuswb m1, m6
- mova [dstq+strideq ], m1
- paddw m6, m7, m5
- paddw m1, m7, m4
- packuswb m1, m6
- mova [dstq+strideq+16], m1
- lea dstq, [dstq+strideq*2]
- inc lineq
- jnz .loop
- REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 000000000..2a83b9001
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// Write the 4-byte pattern dc to the start of 8 consecutive rows.
+// NOTE(review): the store goes through a uint32_t lvalue, so dst is presumably
+// expected to be suitably aligned for every row - confirm with callers.
+static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
+  int row;
+  for (row = 0; row < 8; ++row) {
+    *(uint32_t *)dst = dc;
+    dst += stride;
+  }
+}
+
+// Write the low 8 bytes of *row to each of height rows, stride bytes apart.
+static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+                                ptrdiff_t stride) {
+  for (; height > 0; --height) {
+    _mm_storel_epi64((__m128i *)dst, *row);
+    dst += stride;
+  }
+}
+
+// Write the 16 bytes of *row to each of height rows (aligned stores), stride
+// bytes apart.
+static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  for (; height > 0; --height) {
+    _mm_store_si128((__m128i *)dst, *row);
+    dst += stride;
+  }
+}
+
+// Write the 16 bytes of *row twice (32 bytes in total, aligned stores) to each
+// of height rows, stride bytes apart.
+static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  for (; height > 0; --height) {
+    __m128i *const out = (__m128i *)dst;
+    _mm_store_si128(out, *row);
+    _mm_store_si128(out + 1, *row);
+    dst += stride;
+  }
+}
+
+// Sum of the first 4 bytes at ref, returned in the low 64 bits of the result.
+// NOTE(review): _mm_loadl_epi64 reads 8 bytes although only bytes 0-3 feed the
+// low-half sum; callers presumably guarantee the extra 4 bytes are readable.
+static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ // Widen to 16 bits: the low 64 bits now contain pixels 0-3 only, so the SAD
+ // of the low half is their sum (the high half sums bytes 4-7).
+ x = _mm_unpacklo_epi8(x, zero);
+ return _mm_sad_epu8(x, zero);
+}
+
+// Sum of the 8 bytes at ref, returned in the low 64 bits of the result.
+static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+  const __m128i pixels = _mm_loadl_epi64((__m128i const *)ref);
+  return _mm_sad_epu8(pixels, _mm_setzero_si128());
+}
+
+// Horizontal sum of the 16 bytes at ref (aligned load). The total is returned
+// in the low 16 bits of the vector.
+static INLINE __m128i dc_sum_16(const uint8_t *ref) {
+  const __m128i pixels = _mm_load_si128((__m128i const *)ref);
+  // SAD against zero leaves one partial sum in each 64-bit half.
+  const __m128i halves = _mm_sad_epu8(pixels, _mm_setzero_si128());
+  return _mm_add_epi16(halves, _mm_unpackhi_epi64(halves, halves));
+}
+
+// Horizontal sum of the 32 bytes at ref (two aligned 16-byte loads). The total
+// is returned in the low 16 bits of the vector.
+static INLINE __m128i dc_sum_32(const uint8_t *ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i first = _mm_load_si128((__m128i const *)ref);
+  const __m128i second = _mm_load_si128((__m128i const *)(ref + 16));
+  const __m128i partial =
+      _mm_add_epi16(_mm_sad_epu8(first, zero), _mm_sad_epu8(second, zero));
+  // Fold the upper 64-bit half onto the lower one.
+  return _mm_add_epi16(partial, _mm_unpackhi_epi64(partial, partial));
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+// DC_PRED for a 4x8 block: rounded average of 4 above and 8 left pixels.
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_8(left), dc_sum_4(above));
+
+  // 12 samples contribute; +6 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 6) / 12;
+
+  // Replicate the byte and keep the low four copies as one 32-bit pattern.
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  const uint32_t pattern = _mm_cvtsi128_si32(splat);
+  dc_store_4x8(pattern, dst, stride);
+}
+
+// DC_PRED for an 8x4 block: rounded average of 8 above and 4 left pixels.
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_8(above), dc_sum_4(left));
+
+  // 12 samples contribute; +6 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 6) / 12;
+
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  dc_store_8xh(&splat, 4, dst, stride);
+}
+
+// DC_PRED for an 8x16 block: rounded average of 8 above and 16 left pixels.
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_8(above), dc_sum_16(left));
+
+  // 24 samples contribute; +12 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 12) / 24;
+
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  dc_store_8xh(&splat, 16, dst, stride);
+}
+
+// DC_PRED for a 16x8 block: rounded average of 16 above and 8 left pixels.
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_16(above), dc_sum_8(left));
+
+  // 24 samples contribute; +12 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 12) / 24;
+
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  dc_store_16xh(&splat, 8, dst, stride);
+}
+
+// DC_PRED for a 16x32 block: rounded average of 16 above and 32 left pixels.
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_32(left), dc_sum_16(above));
+
+  // 48 samples contribute; +24 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 24) / 48;
+
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  dc_store_16xh(&splat, 32, dst, stride);
+}
+
+// DC_PRED for a 32x16 block: rounded average of 32 above and 16 left pixels.
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i total = _mm_add_epi16(dc_sum_32(above), dc_sum_16(left));
+
+  // 48 samples contribute; +24 rounds to nearest before the division.
+  uint32_t dc = _mm_cvtsi128_si32(total);
+  dc = (dc + 24) / 48;
+
+  const __m128i splat = _mm_set1_epi8((uint8_t)dc);
+  dc_store_32xh(&splat, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// DC_TOP for a 4x8 block: rounded mean of the 4 above pixels, (sum + 2) >> 2.
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_4(above), _mm_set1_epi16(2));
+  dc = _mm_srai_epi16(dc, 2);
+  dc = _mm_shufflelo_epi16(dc, 0);  // copy lane 0 into the four low lanes
+  dc = _mm_packus_epi16(dc, dc);    // narrow to bytes
+
+  const uint32_t pattern = _mm_cvtsi128_si32(dc);
+  dc_store_4x8(pattern, dst, stride);
+}
+
+// DC_TOP for an 8x4 block: rounded mean of the 8 above pixels, (sum + 4) >> 3.
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_8(above), _mm_set1_epi16(4));
+  dc = _mm_srai_epi16(dc, 3);
+  // Double each byte so 16-bit lane 0 carries the DC value twice, then copy
+  // that lane across the low 8 bytes.
+  dc = _mm_unpacklo_epi8(dc, dc);
+  const __m128i row = _mm_shufflelo_epi16(dc, 0);
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+// DC_TOP for an 8x16 block: rounded mean of the 8 above pixels,
+// (sum + 4) >> 3.
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_8(above), _mm_set1_epi16(4));
+  dc = _mm_srai_epi16(dc, 3);
+  // Double each byte so 16-bit lane 0 carries the DC value twice, then copy
+  // that lane across the low 8 bytes.
+  dc = _mm_unpacklo_epi8(dc, dc);
+  const __m128i row = _mm_shufflelo_epi16(dc, 0);
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+// DC_TOP for a 16x8 block: rounded mean of the 16 above pixels,
+// (sum + 8) >> 4.
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_16(above), _mm_set1_epi16(8));
+  dc = _mm_srai_epi16(dc, 4);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_16xh(&row, 8, dst, stride);
+}
+
+// DC_TOP for a 16x32 block: rounded mean of the 16 above pixels,
+// (sum + 8) >> 4.
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_16(above), _mm_set1_epi16(8));
+  dc = _mm_srai_epi16(dc, 4);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_16xh(&row, 32, dst, stride);
+}
+
+// DC_TOP for a 32x16 block: rounded mean of the 32 above pixels,
+// (sum + 16) >> 5.
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i dc = _mm_add_epi16(dc_sum_32(above), _mm_set1_epi16(16));
+  dc = _mm_srai_epi16(dc, 5);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_32xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// DC_LEFT for a 4x8 block: rounded mean of the 8 left pixels, (sum + 4) >> 3.
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_8(left), _mm_set1_epi16(4));
+  dc = _mm_srai_epi16(dc, 3);
+  dc = _mm_shufflelo_epi16(dc, 0);  // copy lane 0 into the four low lanes
+  dc = _mm_packus_epi16(dc, dc);    // narrow to bytes
+
+  const uint32_t pattern = _mm_cvtsi128_si32(dc);
+  dc_store_4x8(pattern, dst, stride);
+}
+
+// DC_LEFT for an 8x4 block: rounded mean of the 4 left pixels, (sum + 2) >> 2.
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_4(left), _mm_set1_epi16(2));
+  dc = _mm_srai_epi16(dc, 2);
+  // Double each byte so 16-bit lane 0 carries the DC value twice, then copy
+  // that lane across the low 8 bytes.
+  dc = _mm_unpacklo_epi8(dc, dc);
+  const __m128i row = _mm_shufflelo_epi16(dc, 0);
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+// DC_LEFT for an 8x16 block: rounded mean of the 16 left pixels,
+// (sum + 8) >> 4.
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_16(left), _mm_set1_epi16(8));
+  dc = _mm_srai_epi16(dc, 4);
+  // Double each byte so 16-bit lane 0 carries the DC value twice, then copy
+  // that lane across the low 8 bytes.
+  dc = _mm_unpacklo_epi8(dc, dc);
+  const __m128i row = _mm_shufflelo_epi16(dc, 0);
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+// DC_LEFT for a 16x8 block: rounded mean of the 8 left pixels, (sum + 4) >> 3.
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_8(left), _mm_set1_epi16(4));
+  dc = _mm_srai_epi16(dc, 3);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_16xh(&row, 8, dst, stride);
+}
+
+// DC_LEFT for a 16x32 block: rounded mean of the 32 left pixels,
+// (sum + 16) >> 5.
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_32(left), _mm_set1_epi16(16));
+  dc = _mm_srai_epi16(dc, 5);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_16xh(&row, 32, dst, stride);
+}
+
+// DC_LEFT for a 32x16 block: rounded mean of the 16 left pixels,
+// (sum + 8) >> 4.
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i dc = _mm_add_epi16(dc_sum_16(left), _mm_set1_epi16(8));
+  dc = _mm_srai_epi16(dc, 4);
+  dc = _mm_unpacklo_epi8(dc, dc);   // lane 0 = DC byte twice
+  dc = _mm_shufflelo_epi16(dc, 0);  // fill the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(dc, dc);  // fill all 16 bytes
+  dc_store_32xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// DC_128 for a 4x8 block: neither edge is read; each row gets four 0x80 bytes.
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const uint32_t mid_level = 0x80808080;
+  (void)left;
+  (void)above;
+  dc_store_4x8(mid_level, dst, stride);
+}
+
+// DC_128 for an 8x4 block: neither edge is read; the byte 128 is broadcast.
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const __m128i mid_level = _mm_set1_epi8((uint8_t)128);
+  (void)left;
+  (void)above;
+  dc_store_8xh(&mid_level, 4, dst, stride);
+}
+
+// DC_128 for an 8x16 block: neither edge is read; the byte 128 is broadcast.
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i mid_level = _mm_set1_epi8((uint8_t)128);
+  (void)left;
+  (void)above;
+  dc_store_8xh(&mid_level, 16, dst, stride);
+}
+
+// DC_128 for a 16x8 block: neither edge is read; the byte 128 is broadcast.
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i mid_level = _mm_set1_epi8((uint8_t)128);
+  (void)left;
+  (void)above;
+  dc_store_16xh(&mid_level, 8, dst, stride);
+}
+
+// DC_128 for a 16x32 block: neither edge is read; the byte 128 is broadcast.
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i mid_level = _mm_set1_epi8((uint8_t)128);
+  (void)left;
+  (void)above;
+  dc_store_16xh(&mid_level, 32, dst, stride);
+}
+
+// DC_128 for a 32x16 block: neither edge is read; the byte 128 is broadcast.
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i mid_level = _mm_set1_epi8((uint8_t)128);
+  (void)left;
+  (void)above;
+  dc_store_32xh(&mid_level, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+// V_PRED for a 4x8 block: the 4 above pixels are copied to each of the 8 rows.
+// The read keeps the const qualifier of above (the original cast dropped it).
+// NOTE(review): above is read through a uint32_t lvalue, so it is presumably
+// expected to be 4-byte aligned - confirm with callers.
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint32_t pred = *(const uint32_t *)above;
+  (void)left;
+  dc_store_4x8(pred, dst, stride);
+}
+
+// V_PRED for an 8x4 block: the 8 above pixels are copied to each of 4 rows.
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
+  dc_store_8xh(&top_row, 4, dst, stride);
+}
+
+// V_PRED for an 8x16 block: the 8 above pixels are copied to each of 16 rows.
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
+  dc_store_8xh(&top_row, 16, dst, stride);
+}
+
+// V_PRED for a 16x8 block: the 16 above pixels (aligned load) are copied to
+// each of 8 rows.
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m128i top_row = _mm_load_si128((__m128i const *)above);
+  dc_store_16xh(&top_row, 8, dst, stride);
+}
+
+// V_PRED for a 16x32 block: the 16 above pixels (aligned load) are copied to
+// each of 32 rows.
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m128i top_row = _mm_load_si128((__m128i const *)above);
+  dc_store_16xh(&top_row, 32, dst, stride);
+}
+
+// V_PRED for a 32x16 block: the 32 above pixels (two aligned loads) are copied
+// to each of 16 rows with aligned stores.
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m128i top_lo = _mm_load_si128((__m128i const *)above);
+  const __m128i top_hi = _mm_load_si128((__m128i const *)(above + 16));
+  int rows_left = 16;
+  while (rows_left > 0) {
+    __m128i *const out = (__m128i *)dst;
+    _mm_store_si128(out, top_lo);
+    _mm_store_si128(out + 1, top_hi);
+    dst += stride;
+    --rows_left;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+// H_PRED for a 4x8 block: every row is filled with that row's left pixel.
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left8 = _mm_loadl_epi64((__m128i const *)left);
+  // Double every byte so each 16-bit lane carries one left pixel twice; the
+  // low 64 bits cover rows 0-3 and the high 64 bits rows 4-7.
+  __m128i pairs = _mm_unpacklo_epi8(left8, left8);
+  int half;
+  for (half = 0; half < 2; ++half) {
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0));
+    dst += stride;
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x55));
+    dst += stride;
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xaa));
+    dst += stride;
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xff));
+    dst += stride;
+    // Bring rows 4-7 into the low half for the second pass.
+    pairs = _mm_unpackhi_epi64(pairs, pairs);
+  }
+}
+
+// H_PRED for an 8x4 block: every row is filled with that row's left pixel.
+// NOTE(review): 8 bytes are loaded from left although only the first 4 are
+// used; callers presumably guarantee that many bytes are readable.
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left8 = _mm_loadl_epi64((__m128i const *)left);
+  // Each 16-bit lane now carries one left pixel twice.
+  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
+
+  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pairs, 0));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pairs, 0x55));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pairs, 0xaa));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pairs, 0xff));
+}
+
+// H_PRED for an 8x16 block: every row is filled with that row's left pixel.
+// The 16 left pixels are read with one aligned load.
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left_col = _mm_load_si128((__m128i const *)left);
+
+  // Doubling each byte makes every 16-bit lane hold one left pixel twice.
+  // group[g] has the pixels of rows 4g..4g+3 in its four low lanes.
+  __m128i group[4];
+  group[0] = _mm_unpacklo_epi8(left_col, left_col);
+  group[1] = _mm_unpackhi_epi64(group[0], group[0]);
+  group[2] = _mm_unpackhi_epi8(left_col, left_col);
+  group[3] = _mm_unpackhi_epi64(group[2], group[2]);
+
+  int g;
+  for (g = 0; g < 4; ++g) {
+    const __m128i src = group[g];
+    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0));
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0x55));
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0xaa));
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0xff));
+    dst += stride;
+  }
+}
+
+// Store row[0..h-1], one 16-byte vector per output row (aligned stores).
+static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  int r = 0;
+  while (r < h) {
+    _mm_store_si128((__m128i *)dst, row[r]);
+    dst += stride;
+    ++r;
+  }
+}
+
+// For k = 0..3, fill row[k] with 16-bit lane k of *x replicated to all lanes.
+static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
+  const __m128i src = *x;
+  // Replicate one low lane across the low 64 bits...
+  const __m128i lane0 = _mm_shufflelo_epi16(src, 0);
+  const __m128i lane1 = _mm_shufflelo_epi16(src, 0x55);
+  const __m128i lane2 = _mm_shufflelo_epi16(src, 0xaa);
+  const __m128i lane3 = _mm_shufflelo_epi16(src, 0xff);
+
+  // ...then duplicate the low 64 bits into the high 64 bits.
+  row[0] = _mm_unpacklo_epi64(lane0, lane0);
+  row[1] = _mm_unpacklo_epi64(lane1, lane1);
+  row[2] = _mm_unpacklo_epi64(lane2, lane2);
+  row[3] = _mm_unpacklo_epi64(lane3, lane3);
+}
+
+// For k = 0..3, fill row[k] with 16-bit lane 4 + k of *x replicated to all
+// lanes.
+static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
+  const __m128i src = *x;
+  // Replicate one high lane across the high 64 bits...
+  const __m128i lane4 = _mm_shufflehi_epi16(src, 0);
+  const __m128i lane5 = _mm_shufflehi_epi16(src, 0x55);
+  const __m128i lane6 = _mm_shufflehi_epi16(src, 0xaa);
+  const __m128i lane7 = _mm_shufflehi_epi16(src, 0xff);
+
+  // ...then duplicate the high 64 bits into the low 64 bits.
+  row[0] = _mm_unpackhi_epi64(lane4, lane4);
+  row[1] = _mm_unpackhi_epi64(lane5, lane5);
+  row[2] = _mm_unpackhi_epi64(lane6, lane6);
+  row[3] = _mm_unpackhi_epi64(lane7, lane7);
+}
+
+// 16-wide H_PRED, first 4 rows of an 8-row group.
+// Uses the low 8 bytes of the left register: xxxxxxxx33221100
+static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i rows[4];
+  repeat_low_4pixels(left, rows);
+  h_pred_store_16xh(rows, 4, dst, stride);
+}
+
+// 16-wide H_PRED, second 4 rows of an 8-row group.
+// Uses the high 8 bytes of the left register: 77665544xxxxxxxx
+static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i rows[4];
+  repeat_high_4pixels(left, rows);
+  h_pred_store_16xh(rows, 4, dst, stride);
+}
+
+// H_PRED for a 16x8 block: every row is filled with that row's left pixel.
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
+  // Each 16-bit lane carries one of the 8 left pixels twice.
+  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
+  h_prediction_16x8_1(&pairs, dst, stride);               // rows 0-3
+  h_prediction_16x8_2(&pairs, dst + (stride << 2), stride);  // rows 4-7
+}
+
+// H_PRED for a 16x32 block: every row is filled with that row's left pixel.
+// The left column is read 16 pixels at a time with aligned loads; each load
+// produces 16 output rows in four groups of four.
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  int pass;
+  for (pass = 0; pass < 2; ++pass) {
+    const __m128i left_col = _mm_load_si128((const __m128i *)left);
+
+    // Left pixels 0-7 of this pass, each doubled into a 16-bit lane.
+    __m128i pairs = _mm_unpacklo_epi8(left_col, left_col);
+    h_prediction_16x8_1(&pairs, dst, stride);
+    dst += stride << 2;
+    h_prediction_16x8_2(&pairs, dst, stride);
+    dst += stride << 2;
+
+    // Left pixels 8-15 of this pass.
+    pairs = _mm_unpackhi_epi8(left_col, left_col);
+    h_prediction_16x8_1(&pairs, dst, stride);
+    dst += stride << 2;
+    h_prediction_16x8_2(&pairs, dst, stride);
+    dst += stride << 2;
+
+    left += 16;
+  }
+}
+
+// Store row[0..h-1]; each 16-byte vector is written twice per output row to
+// cover 32 bytes (aligned stores).
+static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  int r = 0;
+  while (r < h) {
+    __m128i *const out = (__m128i *)dst;
+    _mm_store_si128(out, row[r]);
+    _mm_store_si128(out + 1, row[r]);
+    dst += stride;
+    ++r;
+  }
+}
+
+// 32-wide H_PRED, first 4 rows of an 8-row group.
+// Uses the low 8 bytes of the left register: xxxxxxxx33221100
+static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i rows[4];
+  repeat_low_4pixels(left, rows);
+  h_pred_store_32xh(rows, 4, dst, stride);
+}
+
+// 32-wide H_PRED, second 4 rows of an 8-row group.
+// Uses the high 8 bytes of the left register: 77665544xxxxxxxx
+static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i rows[4];
+  repeat_high_4pixels(left, rows);
+  h_pred_store_32xh(rows, 4, dst, stride);
+}
+
+// H_PRED for a 32x16 block: every row is filled with that row's left pixel.
+// The 16 left pixels are read with one aligned load.
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const ptrdiff_t four_rows = stride << 2;
+  const __m128i left_col = _mm_load_si128((const __m128i *)left);
+
+  // Left pixels 0-7, each doubled into a 16-bit lane: rows 0-7.
+  const __m128i pairs_lo = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_32x8_1(&pairs_lo, dst, stride);
+  dst += four_rows;
+  h_prediction_32x8_2(&pairs_lo, dst, stride);
+  dst += four_rows;
+
+  // Left pixels 8-15: rows 8-15.
+  const __m128i pairs_hi = _mm_unpackhi_epi8(left_col, left_col);
+  h_prediction_32x8_1(&pairs_hi, dst, stride);
+  dst += four_rows;
+  h_prediction_32x8_2(&pairs_hi, dst, stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 000000000..85b82744e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,885 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// TM_PRED
+
+ // Return 8 16-bit pixels in one row
+ // Per 16-bit lane: base = top + left - topleft, and the result is whichever of
+ // left, top, topleft has the smallest absolute difference from base. Ties are
+ // resolved in favour of left first, then top.
+ static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
+ const __m128i *topleft) {
+ const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
+
+ // Absolute distance of each candidate from base.
+ __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
+ __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
+ __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
+
+ // mask1 is set where left is NOT the closest candidate.
+ __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
+ mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
+ // mask2 is set where topleft is strictly closer than top.
+ __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
+
+ // From here on pl/pt/ptl are reused to hold selected pixel values.
+ pl = _mm_andnot_si128(mask1, *left);
+
+ ptl = _mm_and_si128(mask2, *topleft);
+ pt = _mm_andnot_si128(mask2, *top);
+ pt = _mm_or_si128(pt, ptl);
+ pt = _mm_and_si128(mask1, pt);
+
+ return _mm_or_si128(pl, pt);
+ }
+
+ // Paeth predictor, 4 columns by 4 rows. The top-left pixel is above[-1].
+ // NOTE(review): both loads below fetch 8 bytes although only the first 4 bytes
+ // of above and left are used here - confirm the callers' buffers allow that.
+ void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control: per 16-bit lane the low byte selects a byte of l (index 0
+ // to start) and the high byte 0x80 forces zero, i.e. left[i] zero-extended to
+ // 16 bits in every lane. Adding one per row steps to the next left pixel.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ // Only the low 4 packed bytes are written.
+ *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 4 columns by 8 rows. The top-left pixel is above[-1].
+ // NOTE(review): above is loaded as 8 bytes although only 4 are used - confirm
+ // the callers' buffer allows that.
+ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ // Only the low 4 packed bytes are written.
+ *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 8 columns by 4 rows. The top-left pixel is above[-1].
+ // NOTE(review): left is loaded as 8 bytes although only 4 rows are produced -
+ // confirm the callers' buffer allows that.
+ void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ // Pack the 8 results to bytes and write the low 8 bytes.
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 8 columns by 8 rows. The top-left pixel is above[-1].
+ void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ // Pack the 8 results to bytes and write the low 8 bytes.
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 8 columns by 16 rows. The top-left pixel is above[-1].
+ // left is read with an aligned 16-byte load.
+ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ // Pack the 8 results to bytes and write the low 8 bytes.
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Return 16 8-bit pixels in one row
+ // top0 and top1 hold the two 8-pixel halves of the above row as 16-bit lanes;
+ // each half is predicted with paeth_8x1_pred() and the two results are packed
+ // to bytes with unsigned saturation (top0 result in the low 8 bytes).
+ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
+ const __m128i *top1,
+ const __m128i *topleft) {
+ const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
+ const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
+ return _mm_packus_epi16(p0, p1);
+ }
+
+ // Paeth predictor, 16 columns by 8 rows. The top-left pixel is above[-1].
+ // above is read with an aligned 16-byte load and each row is written with an
+ // aligned 16-byte store.
+ void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ // Widen the above row to 16-bit lanes: columns 0..7 and 8..15.
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 16 columns by 16 rows. The top-left pixel is above[-1].
+ // above and left are read with aligned 16-byte loads and each row is written
+ // with an aligned 16-byte store.
+ void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ // Widen the above row to 16-bit lanes: columns 0..7 and 8..15.
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 16 columns by 32 rows. The top-left pixel is above[-1].
+ // The rows are produced in two passes of 16, one per 16-byte half of left.
+ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ // Widen the above row to 16-bit lanes: columns 0..7 and 8..15.
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ // Rows 0..15 use left[0..15].
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ // Rows 16..31 use left[16..31]; the shuffle control restarts at byte 0.
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 32 columns by 16 rows. The top-left pixel is above[-1].
+ // All loads and stores are aligned 16-byte accesses.
+ void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ // Widen the 32 above pixels to four vectors of eight 16-bit lanes.
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ // Columns 0..15 and 16..31 of the current row.
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+ // Paeth predictor, 32 columns by 32 rows. The top-left pixel is above[-1].
+ // The rows are produced in two passes of 16, one per 16-byte half of left.
+ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ // Widen the 32 above pixels to four vectors of eight 16-bit lanes.
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ // Shuffle control that broadcasts left[i], zero-extended to 16 bits, into
+ // every lane; incremented once per row.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ // Rows 0..15 use left[0..15].
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ // Rows 16..31 use left[16..31]; the shuffle control restarts at byte 0.
+ rep = _mm_set1_epi16(0x8000);
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+ // pixels[0]: above and below_pred interleave vector
+ // pixels[1]: left vector
+ // pixels[2]: right_pred vector
+ //
+ // Reads exactly 4 bytes of above and exactly `height` bytes of left (height
+ // is 4 or 8 for the callers in this file). Earlier code always fetched 8 bytes
+ // from both, reading past the end of the 4-pixel edges. The bytes that are no
+ // longer loaded were never consumed: pixels[0] only interleaves the low four
+ // 16-bit lanes, and a 4-row block only shuffles bytes 0..3 out of pixels[1].
+ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+                                  int height, __m128i *pixels) {
+   const __m128i zero = _mm_setzero_si128();
+   // below_pred: the last left pixel, broadcast to every 16-bit lane.
+   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+
+   // Assemble the four above pixels in little-endian byte order so that
+   // above[0] ends up in byte 0 of the register.
+   const uint32_t above4 = (uint32_t)above[0] | ((uint32_t)above[1] << 8) |
+                           ((uint32_t)above[2] << 16) |
+                           ((uint32_t)above[3] << 24);
+   __m128i d = _mm_cvtsi32_si128((int)above4);
+
+   // right_pred: the last above pixel, broadcast to every 16-bit lane.
+   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
+
+   if (height == 4) {
+     const uint32_t left4 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
+                            ((uint32_t)left[2] << 16) |
+                            ((uint32_t)left[3] << 24);
+     pixels[1] = _mm_cvtsi32_si128((int)left4);
+   } else {
+     pixels[1] = _mm_loadl_epi64((const __m128i *)left);
+   }
+
+   d = _mm_unpacklo_epi8(d, zero);
+   pixels[0] = _mm_unpacklo_epi16(d, bp);
+ }
+
+ // weights[0]: weights_h vector
+ // weights[1]: scale - weights_h vector
+ // weights[2]: weights_w and scale - weights_w interleave vector
+ //
+ // One unaligned 16-byte load starting at weight_array[4] supplies everything:
+ // bytes 0..3 of that load feed the width weights (and the height weights when
+ // height is 4), bytes 4..7 feed the height weights when height is 8.
+ static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
+ __m128i *weights) {
+ __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+ const __m128i zero = _mm_setzero_si128();
+
+ weights[0] = _mm_unpacklo_epi8(t, zero);
+ // d is the weight scale, 1 << sm_weight_log2_scale, in every 16-bit lane.
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ weights[1] = _mm_sub_epi16(d, weights[0]);
+ weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);
+
+ if (height == 8) {
+ // Drop the first four weights so lane 0 holds weight_array[8].
+ t = _mm_srli_si128(t, 4);
+ weights[0] = _mm_unpacklo_epi8(t, zero);
+ weights[1] = _mm_sub_epi16(d, weights[0]);
+ }
+ }
+
+ // Writes h rows of 4 smooth-predicted pixels to dst.
+ // pixel[] is laid out as produced by load_pixel_w4() and weight[] as produced
+ // by load_weight_w4(). Per output pixel the code computes
+ //   above * w_h + below_pred * (scale - w_h)
+ //   + left * w_w + right_pred * (scale - w_w)
+ // adds `round` and shifts right by (1 + sm_weight_log2_scale).
+ static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
+ int h, uint8_t *dst, ptrdiff_t stride) {
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ // gat gathers the low byte of each 32-bit lane (bytes 0, 4, 8, 12).
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ // rep broadcasts left[i] zero-extended to 16 bits (high control byte 0x80
+ // zeroes the upper byte); d broadcasts 16-bit word i of the height weights.
+ __m128i rep = _mm_set1_epi16(0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ int i;
+ for (i = 0; i < h; ++i) {
+ // Vertical term: (above, below_pred) pairs times (w_h, scale - w_h) pairs.
+ const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
+
+ // Horizontal term: (left, right_pred) pairs times (w_w, scale - w_w) pairs.
+ __m128i b = _mm_shuffle_epi8(pixel[1], rep);
+ b = _mm_unpacklo_epi16(b, pixel[2]);
+ __m128i sum = _mm_madd_epi16(b, weight[2]);
+
+ sum = _mm_add_epi32(s, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
+
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+
+ // Step to the next left pixel and the next height weight.
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+ }
+
+ // Smooth predictor, 4 columns by 4 rows: gathers the edge pixels and the
+ // weights from sm_weight_arrays, then runs the 4-wide kernel for 4 rows.
+ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 4, pixels);
+
+ __m128i weights[3];
+ load_weight_w4(sm_weight_arrays, 4, weights);
+
+ smooth_pred_4xh(pixels, weights, 4, dst, stride);
+ }
+
+ // Smooth predictor, 4 columns by 8 rows: gathers the edge pixels and the
+ // weights from sm_weight_arrays, then runs the 4-wide kernel for 8 rows.
+ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 8, pixels);
+
+ __m128i weights[3];
+ load_weight_w4(sm_weight_arrays, 8, weights);
+
+ smooth_pred_4xh(pixels, weights, 8, dst, stride);
+ }
+
+ // pixels[0]: above and below_pred interleave vector, first half
+ // pixels[1]: above and below_pred interleave vector, second half
+ // pixels[2]: left vector
+ // pixels[3]: right_pred vector
+ //
+ // Reads exactly `height` bytes of left (height is 4, 8 or 16 for the callers
+ // in this file). Earlier code always issued a 16-byte load, reading past the
+ // end of the left edge for 4- and 8-row blocks. The bytes that are no longer
+ // loaded were never consumed: an h-row block only shuffles bytes 0..h-1 out
+ // of pixels[2].
+ static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+                                  int height, __m128i *pixels) {
+   const __m128i zero = _mm_setzero_si128();
+   // below_pred: the last left pixel, broadcast to every 16-bit lane.
+   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+   __m128i d = _mm_loadl_epi64((const __m128i *)above);
+
+   // right_pred: the last above pixel, broadcast to every 16-bit lane.
+   pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+
+   if (height == 4) {
+     // Little-endian assembly puts left[0] in byte 0 of the register.
+     const uint32_t left4 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
+                            ((uint32_t)left[2] << 16) |
+                            ((uint32_t)left[3] << 24);
+     pixels[2] = _mm_cvtsi32_si128((int)left4);
+   } else if (height == 8) {
+     pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+   } else {
+     pixels[2] = _mm_load_si128((const __m128i *)left);
+   }
+
+   d = _mm_unpacklo_epi8(d, zero);
+   pixels[0] = _mm_unpacklo_epi16(d, bp);
+   pixels[1] = _mm_unpackhi_epi16(d, bp);
+ }
+
+ // weight_h[0]: weight_h vector
+ // weight_h[1]: scale - weight_h vector
+ // weight_h[2]: same as [0], second half for height = 16 only
+ // weight_h[3]: same as [1], second half for height = 16 only
+ // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+ // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+ //
+ // The width weights always come from weight_array[8..15]; the height weights
+ // come from offset 4, 8 or 16 of weight_array for height 4, 8 or 16.
+ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
+ __m128i *weight_h, __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const int we_offset = height < 8 ? 4 : 8;
+ __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+
+ // d is the weight scale, 1 << sm_weight_log2_scale, in every 16-bit lane.
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+
+ if (height == 4) {
+ // The load started at offset 4; skip 4 bytes to reach weight_array[8].
+ we = _mm_srli_si128(we, 4);
+ __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+ __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+ weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+ weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+ } else {
+ // weight_h currently holds weight_array[8..15], which is also what the
+ // width weights need.
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ // Replace the height weights with weight_array[16..31], split in halves.
+ we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(we, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ }
+ }
+
+ // Writes h rows of 8 smooth-predicted pixels to dst.
+ // pixels[] is laid out as produced by load_pixel_w8(); wh points at a
+ // (weight_h, scale - weight_h) pair and ww at the two interleaved width-weight
+ // vectors. second_half selects which 8 bytes of the left vector are walked:
+ // bytes 0..7 when zero, bytes 8..15 otherwise.
+ static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ // gat gathers the even bytes (0, 2, ..., 14) into the low 8 bytes.
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+ // rep broadcasts one left byte zero-extended to 16 bits; d broadcasts one
+ // 16-bit word of the height weights. Both advance once per row.
+ __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ int i;
+ for (i = 0; i < h; ++i) {
+ // Vertical term: (above, below_pred) pairs times (w_h, scale - w_h) pairs.
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+
+ // Horizontal term: (left, right_pred) pairs times (w_w, scale - w_w) pairs.
+ __m128i b = _mm_shuffle_epi8(pixels[2], rep);
+ b = _mm_unpacklo_epi16(b, pixels[3]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+
+ s0 = _mm_add_epi32(s0, sum0);
+ s0 = _mm_add_epi32(s0, round);
+ s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
+
+ s1 = _mm_add_epi32(s1, sum1);
+ s1 = _mm_add_epi32(s1, round);
+ s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
+
+ // The 32-bit results are packed as if they were 16-bit lanes, so each
+ // result byte lands on an even byte position; gat then compacts them.
+ sum0 = _mm_packus_epi16(s0, s1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+ }
+
+ // Smooth predictor, 8 columns by 4 rows. Only wh[0..1] are filled and used
+ // for this height.
+ void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 4, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+ }
+
+ // Smooth predictor, 8 columns by 8 rows. Only wh[0..1] are filled and used
+ // for this height.
+ void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ }
+
+ // Smooth predictor, 8 columns by 16 rows, produced as two 8-row passes.
+ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 16, wh, ww);
+
+ // Rows 0..7: height weights wh[0..1], left bytes 0..7.
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ // Rows 8..15: height weights wh[2..3], left bytes 8..15.
+ smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+ }
+
+ // pixels[0]: above and below_pred interleave vector, 1/4
+ // pixels[1]: above and below_pred interleave vector, 2/4
+ // pixels[2]: above and below_pred interleave vector, 3/4
+ // pixels[3]: above and below_pred interleave vector, 4/4
+ // pixels[4]: left vector
+ // pixels[5]: left vector, h = 32 only (zero for other heights)
+ // pixels[6]: right_pred vector
+ //
+ // Reads exactly `height` bytes of left (height is 8, 16 or 32 for the callers
+ // in this file). Earlier code always loaded left[0..31], reading past the end
+ // of the left edge for 8- and 16-row blocks. The bytes that are no longer
+ // loaded were never consumed: pixels[5] is only selected for rows 16..31, and
+ // an 8-row block only shuffles bytes 0..7 out of pixels[4].
+ static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+   const __m128i zero = _mm_setzero_si128();
+   // below_pred: the last left pixel, broadcast to every 16-bit lane.
+   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+   const __m128i ab = _mm_load_si128((const __m128i *)above);
+
+   // right_pred: the last above pixel, broadcast to every 16-bit lane.
+   pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
+
+   if (height == 8) {
+     pixels[4] = _mm_loadl_epi64((const __m128i *)left);
+   } else {
+     pixels[4] = _mm_load_si128((const __m128i *)left);
+   }
+
+   if (height == 32) {
+     pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
+   } else {
+     // Keep the slot defined even though shorter blocks never select it.
+     pixels[5] = zero;
+   }
+
+   __m128i x = _mm_unpacklo_epi8(ab, zero);
+   pixels[0] = _mm_unpacklo_epi16(x, bp);
+   pixels[1] = _mm_unpackhi_epi16(x, bp);
+
+   x = _mm_unpackhi_epi8(ab, zero);
+   pixels[2] = _mm_unpacklo_epi16(x, bp);
+   pixels[3] = _mm_unpackhi_epi16(x, bp);
+ }
+
+ // weight_h[0]: weight_h vector
+ // weight_h[1]: scale - weight_h vector
+ // weight_h[2]: same as [0], rows 8..15, height = 16 and 32 only
+ // weight_h[3]: same as [1], rows 8..15, height = 16 and 32 only
+ // weight_h[4..7]: same pattern for rows 16..31, height = 32 only
+ // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+ // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+ // weight_w[2..3]: same pattern for columns 8..15
+ //
+ // The width weights always come from weight_array[16..31]; the height weights
+ // come from offset 8, 16 or 32 of weight_array for height 8, 16 or 32. All
+ // four 16-byte loads are issued regardless of height.
+ static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
+ __m128i *weight_h, __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+ __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
+ __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+ // d is the weight scale, 1 << sm_weight_log2_scale, in every 16-bit lane.
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+ if (height == 8) {
+ weight_h[0] = _mm_unpacklo_epi8(w8, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h
+
+ __m128i x = _mm_unpacklo_epi8(w16, zero);
+ __m128i y = _mm_sub_epi16(d, x);
+ weight_w[0] = _mm_unpacklo_epi16(x, y);
+ weight_w[1] = _mm_unpackhi_epi16(x, y);
+ x = _mm_unpackhi_epi8(w16, zero);
+ y = _mm_sub_epi16(d, x);
+ weight_w[2] = _mm_unpacklo_epi16(x, y);
+ weight_w[3] = _mm_unpackhi_epi16(x, y);
+ }
+
+ if (height == 16) {
+ weight_h[0] = _mm_unpacklo_epi8(w16, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(w16, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+
+ // Square block: the width weights are the height weights, interleaved.
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
+ weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+ }
+
+ if (height == 32) {
+ weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+
+ __m128i x = _mm_unpacklo_epi8(w16, zero);
+ __m128i y = _mm_sub_epi16(d, x);
+ weight_w[0] = _mm_unpacklo_epi16(x, y);
+ weight_w[1] = _mm_unpackhi_epi16(x, y);
+ x = _mm_unpackhi_epi8(w16, zero);
+ y = _mm_sub_epi16(d, x);
+ weight_w[2] = _mm_unpacklo_epi16(x, y);
+ weight_w[3] = _mm_unpackhi_epi16(x, y);
+
+ weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ }
+ }
+
+ // Writes 8 rows of 16 smooth-predicted pixels to dst.
+ // pixels[] is laid out as produced by load_pixel_w16(); wh points at the
+ // (weight_h, scale - weight_h) pair for these 8 rows and ww at the four
+ // interleaved width-weight vectors. quarter is the index of this 8-row band
+ // within the block: it picks the left vector (pixels[4] for quarters 0 and 1,
+ // pixels[5] for 2 and 3) and which 8 bytes of it are walked (low 8 for even
+ // quarters, high 8 for odd ones).
+ static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, uint8_t *dst,
+ ptrdiff_t stride, int quarter) {
+ // d broadcasts one 16-bit word of the height weights; advanced by inc.
+ __m128i d = _mm_set1_epi16(0x100);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ // gat gathers the even bytes (0, 2, ..., 14) into the low 8 bytes.
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ // rep broadcasts one left byte zero-extended to 16 bits; advanced by one.
+ __m128i rep =
+ (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
+ const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ // Vertical term: (above, below_pred) pairs times (w_h, scale - w_h) pairs.
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+ __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
+ __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
+
+ // Horizontal term: (left, right_pred) pairs times (w_w, scale - w_w) pairs.
+ __m128i b = _mm_shuffle_epi8(left, rep);
+ b = _mm_unpacklo_epi16(b, pixels[6]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+ __m128i sum2 = _mm_madd_epi16(b, ww[2]);
+ __m128i sum3 = _mm_madd_epi16(b, ww[3]);
+
+ s0 = _mm_add_epi32(s0, sum0);
+ s0 = _mm_add_epi32(s0, round);
+ s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
+
+ s1 = _mm_add_epi32(s1, sum1);
+ s1 = _mm_add_epi32(s1, round);
+ s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
+
+ s2 = _mm_add_epi32(s2, sum2);
+ s2 = _mm_add_epi32(s2, round);
+ s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
+
+ s3 = _mm_add_epi32(s3, sum3);
+ s3 = _mm_add_epi32(s3, round);
+ s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
+
+ // The 32-bit results are packed as if they were 16-bit lanes, so each
+ // result byte lands on an even byte position; gat then compacts them.
+ sum0 = _mm_packus_epi16(s0, s1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ sum1 = _mm_packus_epi16(s2, s3);
+ sum1 = _mm_shuffle_epi8(sum1, gat);
+
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ _mm_storel_epi64((__m128i *)(dst + 8), sum1);
+
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+ }
+
+ // Smooth predictor, 16 columns by 8 rows: a single 8-row band.
+ void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[7];
+ load_pixel_w16(above, left, 8, pixels);
+
+ // For height 8 load_weight_w16() fills only wh[0..1].
+ __m128i wh[2], ww[4];
+ load_weight_w16(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+ }
+
+ // Smooth predictor, 16 columns by 16 rows, produced as two 8-row bands.
+ void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[7];
+ load_pixel_w16(above, left, 16, pixels);
+
+ __m128i wh[4], ww[4];
+ load_weight_w16(sm_weight_arrays, 16, wh, ww);
+
+ // Rows 0..7, then rows 8..15 with the next pair of height-weight vectors.
+ smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+ }
+
+ // Smooth predictor, 16 columns by 32 rows, produced as four 8-row bands.
+ // Each band uses the next pair of height-weight vectors and passes its band
+ // index as `quarter`.
+ void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[7];
+ load_pixel_w16(above, left, 32, pixels);
+
+ __m128i wh[8], ww[4];
+ load_weight_w16(sm_weight_arrays, 32, wh, ww);
+
+ smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
+ dst += stride << 3;
+ smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
+ }
+
+ // pixels[0..7]: above and below_pred interleave vectors, 1/8 .. 8/8
+ // pixels[8]: left vector, rows 0..15
+ // pixels[9]: left vector, rows 16..31, h = 32 only (zero for other heights)
+ // pixels[10]: right_pred vector
+ //
+ // Reads exactly `height` bytes of left (height is 16 or 32 for the callers in
+ // this file). Earlier code always loaded left[16..31] as well, reading past
+ // the end of the left edge for 16-row blocks. pixels[9] is only selected for
+ // rows 16..31, so a 16-row block never consumed it.
+ static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+   const __m128i zero = _mm_setzero_si128();
+   // below_pred: the last left pixel, broadcast to every 16-bit lane.
+   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+   const __m128i ab0 = _mm_load_si128((const __m128i *)above);
+   const __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
+
+   // right_pred: the last above pixel, broadcast to every 16-bit lane.
+   pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
+   pixels[8] = _mm_load_si128((const __m128i *)left);
+
+   if (height == 32) {
+     pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
+   } else {
+     // Keep the slot defined even though shorter blocks never select it.
+     pixels[9] = zero;
+   }
+
+   __m128i x = _mm_unpacklo_epi8(ab0, zero);
+   pixels[0] = _mm_unpacklo_epi16(x, bp);
+   pixels[1] = _mm_unpackhi_epi16(x, bp);
+
+   x = _mm_unpackhi_epi8(ab0, zero);
+   pixels[2] = _mm_unpacklo_epi16(x, bp);
+   pixels[3] = _mm_unpackhi_epi16(x, bp);
+
+   x = _mm_unpacklo_epi8(ab1, zero);
+   pixels[4] = _mm_unpacklo_epi16(x, bp);
+   pixels[5] = _mm_unpackhi_epi16(x, bp);
+
+   x = _mm_unpackhi_epi8(ab1, zero);
+   pixels[6] = _mm_unpacklo_epi16(x, bp);
+   pixels[7] = _mm_unpackhi_epi16(x, bp);
+ }
+
+ // Builds the weight vectors for the 32-wide smooth predictor.
+ // weight_h[2k], weight_h[2k + 1]: weight_h and scale - weight_h for the k-th
+ // band of 8 rows (k < 2 for height 16, k < 4 for height 32).
+ // weight_w[0..7]: weights_w and scale - weights_w interleave vectors, four
+ // columns each, taken from weight_array[32..63].
+ // The height weights come from weight_array[16..31] for height 16 and from
+ // weight_array[32..63] for height 32. Nothing is written for other heights.
+ static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
+ __m128i *weight_h, __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
+ __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+ // d is the weight scale, 1 << sm_weight_log2_scale, in every 16-bit lane.
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+ if (height == 16) {
+ weight_h[0] = _mm_unpacklo_epi8(w16, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(w16, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+
+ __m128i x = _mm_unpacklo_epi8(w32_0, zero);
+ __m128i y = _mm_sub_epi16(d, x);
+ weight_w[0] = _mm_unpacklo_epi16(x, y);
+ weight_w[1] = _mm_unpackhi_epi16(x, y);
+
+ x = _mm_unpackhi_epi8(w32_0, zero);
+ y = _mm_sub_epi16(d, x);
+ weight_w[2] = _mm_unpacklo_epi16(x, y);
+ weight_w[3] = _mm_unpackhi_epi16(x, y);
+
+ x = _mm_unpacklo_epi8(w32_1, zero);
+ y = _mm_sub_epi16(d, x);
+ weight_w[4] = _mm_unpacklo_epi16(x, y);
+ weight_w[5] = _mm_unpackhi_epi16(x, y);
+
+ x = _mm_unpackhi_epi8(w32_1, zero);
+ y = _mm_sub_epi16(d, x);
+ weight_w[6] = _mm_unpacklo_epi16(x, y);
+ weight_w[7] = _mm_unpackhi_epi16(x, y);
+ }
+
+ if (height == 32) {
+ weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+
+ weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+
+ // Square block: the width weights are the height weights, interleaved.
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
+ weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+
+ weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
+ weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
+ weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
+ weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
+ }
+ }
+
+ // Writes 8 rows of 32 smooth-predicted pixels to dst.
+ // pixels[] is laid out as produced by load_pixel_w32(); wh points at the
+ // (weight_h, scale - weight_h) pair for these 8 rows and ww at the eight
+ // interleaved width-weight vectors. quarter is the index of this 8-row band
+ // within the block: it picks the left vector (pixels[8] for quarters 0 and 1,
+ // pixels[9] for 2 and 3) and which 8 bytes of it are walked (low 8 for even
+ // quarters, high 8 for odd ones).
+ static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, uint8_t *dst,
+ ptrdiff_t stride, int quarter) {
+ // d broadcasts one 16-bit word of the height weights; advanced by inc.
+ __m128i d = _mm_set1_epi16(0x100);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ // gat gathers the even bytes (0, 2, ..., 14) into the low 8 bytes.
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ // rep broadcasts one left byte zero-extended to 16 bits; advanced by one.
+ __m128i rep =
+ (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
+ const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+
+ int j;
+ __m128i s[8];
+ // (left, right_pred) pairs for the current row.
+ __m128i b = _mm_shuffle_epi8(left, rep);
+ b = _mm_unpacklo_epi16(b, pixels[10]);
+
+ // Each s[j] covers four output columns: vertical term plus horizontal
+ // term, rounded and scaled down.
+ for (j = 0; j < 8; ++j) {
+ s[j] = _mm_madd_epi16(pixels[j], wh_sc);
+ s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
+ s[j] = _mm_add_epi32(s[j], round);
+ s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
+ }
+
+ // Pack two vectors at a time into 8 bytes; j << 2 gives byte offsets
+ // 0, 8, 16 and 24 within the row.
+ for (j = 0; j < 8; j += 2) {
+ __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
+ sum = _mm_shuffle_epi8(sum, gat);
+ _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
+ }
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+ }
+
+ // Smooth predictor, 32 columns by 16 rows, produced as two 8-row bands.
+ void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[11];
+ load_pixel_w32(above, left, 16, pixels);
+
+ __m128i wh[4], ww[8];
+ load_weight_w32(sm_weight_arrays, 16, wh, ww);
+
+ // Rows 0..7, then rows 8..15 with the next pair of height-weight vectors.
+ smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+ }
+
+ // Smooth predictor, 32 columns by 32 rows, produced as four 8-row bands.
+ // Each band uses the next pair of height-weight vectors and passes its band
+ // index as `quarter`.
+ void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[11];
+ load_pixel_w32(above, left, 32, pixels);
+
+ __m128i wh[8], ww[8];
+ load_weight_w32(sm_weight_arrays, 32, wh, ww);
+
+ smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
+ dst += stride << 3;
+ smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
+ }
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
index 4238e651b..26c5cfe59 100644
--- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
@@ -18,17 +18,17 @@
#include "aom_dsp/x86/txfm_common_avx2.h"
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-#if CONFIG_HIGHBITDEPTH
- *in = _mm256_setr_epi16(
- (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
- (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
- (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
- (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
- (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
- (int16_t)coeff[15]);
-#else
- *in = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+ } else {
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+ }
}
static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
index 95d246c3c..342816977 100644
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
@@ -133,12 +133,12 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
// Function to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
static INLINE __m128i load_input_data(const tran_low_t *data) {
-#if CONFIG_HIGHBITDEPTH
- return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
- data[6], data[7]);
-#else
- return _mm_load_si128((const __m128i *)data);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
+ data[6], data[7]);
+ } else {
+ return _mm_load_si128((const __m128i *)data);
+ }
}
static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index 7e134dc63..8343dbbed 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -178,10 +178,20 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
#endif // !CONFIG_PARALLEL_DEBLOCKING
FILTER4;
+#if CONFIG_PARALLEL_DEBLOCKING
+ *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);
+ ps1ps0 = _mm_srli_si128(ps1ps0, 8);
+ *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);
+
+ *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);
+ qs1qs0 = _mm_srli_si128(qs1qs0, 8);
+ *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);
+#else
_mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
_mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
_mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
_mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
+#endif
}
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
@@ -267,8 +277,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
// 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+#if !CONFIG_PARALLEL_DEBLOCKING
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+#endif
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
@@ -279,7 +291,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
*(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-
+#if !CONFIG_PARALLEL_DEBLOCKING
*(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
@@ -287,6 +299,19 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
*(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+#endif
+}
+
+static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
+ uint8_t *s) {
+#if CONFIG_PARALLEL_DEBLOCKING
+ *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);
+ const __m128i hi = _mm_srli_si128(*x, 8);
+ *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);
+#else
+ _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);
+ _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));
+#endif
}
void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
@@ -580,44 +605,37 @@ void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
q6p6 = _mm_andnot_si128(flat2, q6p6);
flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+ store_buffer_horz_8(&q6p6, p, 6, s);
q5p5 = _mm_andnot_si128(flat2, q5p5);
flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+ store_buffer_horz_8(&q5p5, p, 5, s);
q4p4 = _mm_andnot_si128(flat2, q4p4);
flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+ store_buffer_horz_8(&q4p4, p, 4, s);
q3p3 = _mm_andnot_si128(flat2, q3p3);
flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+ store_buffer_horz_8(&q3p3, p, 3, s);
q2p2 = _mm_andnot_si128(flat2, q2p2);
flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+ store_buffer_horz_8(&q2p2, p, 2, s);
q1p1 = _mm_andnot_si128(flat2, q1p1);
flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+ store_buffer_horz_8(&q1p1, p, 1, s);
q0p0 = _mm_andnot_si128(flat2, q0p0);
flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ store_buffer_horz_8(&q0p0, p, 0, s);
}
}
@@ -651,10 +669,33 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
-void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
+
+static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
+ int p, int offset, uint8_t *s) {
+ int i;
+ if (pixel_num == FOUR_PIXELS) {
+ for (i = 13; i >= 0; i--) {
+ *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
+ }
+ }
+ if (pixel_num == EIGHT_PIXELS) {
+ for (i = 13; i >= 0; i--) {
+ _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
+ }
+ }
+ if (pixel_num == SIXTEEN_PIXELS) {
+ for (i = 13; i >= 0; i--) {
+ _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
+ }
+ }
+}
+
+static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
+ unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -910,73 +951,62 @@ void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
- p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+ __m128i x[14];
+ x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
- p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+ x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
- p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+ x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
- p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
- op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+ x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
- op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
- op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
- oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
- oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
- oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
- q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+ x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
- q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+ x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
- q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
- q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+
+ store_buffer_horz_16(pixel_num, x, p, 6, s);
}
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1186,15 +1216,35 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
+#if CONFIG_PARALLEL_DEBLOCKING
+ *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);
+ *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
+ *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
+ *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
+ *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
+ *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
+#else
_mm_storel_epi64((__m128i *)(s - 3 * p), p2);
_mm_storel_epi64((__m128i *)(s - 2 * p), p1);
_mm_storel_epi64((__m128i *)(s - 1 * p), p0);
_mm_storel_epi64((__m128i *)(s + 0 * p), q0);
_mm_storel_epi64((__m128i *)(s + 1 * p), q1);
_mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+#endif
}
}
+void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING
+ lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
+#else
+ lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
+#endif
+}
+
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
const uint8_t *_limit0,
const uint8_t *_thresh0,
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
new file mode 100644
index 000000000..027c890dc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H
+#define _AOM_DSP_X86_LPF_COMMON_X86_H
+
+#include <emmintrin.h> // SSE2
+
+#include "./aom_config.h"
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
+ int out_p, int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ uint16_t *in = src[idx8x8];
+ uint16_t *out = dst[idx8x8];
+
+ p0 =
+ _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ p1 =
+ _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ p2 =
+ _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ p3 =
+ _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ p4 =
+ _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ p5 =
+ _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ p6 =
+ _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ p7 =
+ _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13
+ x0 = _mm_unpacklo_epi16(p0, p1);
+ // 20 30 21 31 22 32 23 33
+ x1 = _mm_unpacklo_epi16(p2, p3);
+ // 40 50 41 51 42 52 43 53
+ x2 = _mm_unpacklo_epi16(p4, p5);
+ // 60 70 61 71 62 72 63 73
+ x3 = _mm_unpacklo_epi16(p6, p7);
+ // 00 10 20 30 01 11 21 31
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 40 50 60 70 41 51 61 71
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 00 10 20 30 40 50 60 70
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 01 11 21 31 41 51 61 71
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
+ // 00 10 20 30 40 50 60 70
+ _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
+ // 01 11 21 31 41 51 61 71
+
+ // 02 12 22 32 03 13 23 33
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 42 52 62 72 43 53 63 73
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 02 12 22 32 42 52 62 72
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
+ // 02 12 22 32 42 52 62 72
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
+ // 03 13 23 33 43 53 63 73
+
+ // 04 14 05 15 06 16 07 17
+ x0 = _mm_unpackhi_epi16(p0, p1);
+ // 24 34 25 35 26 36 27 37
+ x1 = _mm_unpackhi_epi16(p2, p3);
+ // 44 54 45 55 46 56 47 57
+ x2 = _mm_unpackhi_epi16(p4, p5);
+ // 64 74 65 75 66 76 67 77
+ x3 = _mm_unpackhi_epi16(p6, p7);
+ // 04 14 24 34 05 15 25 35
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 44 54 64 74 45 55 65 75
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 04 14 24 34 44 54 64 74
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 05 15 25 35 45 55 65 75
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
+ // 04 14 24 34 44 54 64 74
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
+ // 05 15 25 35 45 55 65 75
+
+ // 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 06 16 26 36 46 56 66 76
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
+ // 06 16 26 36 46 56 66 76
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
+ // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
+ uint16_t *out, int out_p) {
+ uint16_t *src0[1];
+ uint16_t *src1[1];
+ uint16_t *dest0[1];
+ uint16_t *dest1[1];
+ src0[0] = in0;
+ src1[0] = in1;
+ dest0[0] = out;
+ dest1[0] = out + 8;
+ highbd_transpose(src0, in_p, dest0, out_p, 1);
+ highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+#endif // _AOM_DSP_X86_LPF_COMMON_X86_H
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 6a73ac460..2536f91d2 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -98,7 +98,13 @@ MASKSAD4XN_SSSE3(16)
MASKSADMXN_SSSE3(16, 4)
MASKSAD8XN_SSSE3(32)
MASKSADMXN_SSSE3(32, 8)
-#endif
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(32, 128)
+MASKSADMXN_SSSE3(128, 32)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
int src_stride,
@@ -294,7 +300,13 @@ HIGHBD_MASKSAD4XN_SSSE3(16)
HIGHBD_MASKSADMXN_SSSE3(16, 4)
HIGHBD_MASKSADMXN_SSSE3(8, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 8)
-#endif
+HIGHBD_MASKSADMXN_SSSE3(16, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 16)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(32, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 32)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
index 24e7ed1c6..3ffe132be 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -131,7 +131,13 @@ MASK_SUBPIX_VAR4XH_SSSE3(16)
MASK_SUBPIX_VAR_SSSE3(16, 4)
MASK_SUBPIX_VAR8XH_SSSE3(32)
MASK_SUBPIX_VAR_SSSE3(32, 8)
-#endif
+MASK_SUBPIX_VAR_SSSE3(64, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_SSSE3(128, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
static INLINE __m128i filter_block(const __m128i a, const __m128i b,
const __m128i filter) {
@@ -712,6 +718,12 @@ HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32)
+#endif
#endif
static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
index 3fd6f71e5..52dd508ec 100644
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -142,6 +142,8 @@ OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
#endif
////////////////////////////////////////////////////////////////////////////////
@@ -271,5 +273,7 @@ HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
#endif
#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 44cfa8e28..392616af3 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -151,7 +151,13 @@ OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
-#endif
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
+#if CONFIG_EXT_PARTITION
+OBMCVARWXH(32, 128)
+OBMCVARWXH(128, 32)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
@@ -364,5 +370,11 @@ HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
-#endif
+HBD_OBMCVARWXH(16, 64)
+HBD_OBMCVARWXH(64, 16)
+#if CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(32, 128)
+HBD_OBMCVARWXH(128, 32)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 890c1f01e..0e7f679d0 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -16,29 +16,29 @@
#include "aom/aom_integer.h"
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
- return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+ } else {
+ return _mm_load_si128((const __m128i *)coeff_ptr);
+ }
}
static INLINE void store_coefficients(__m128i coeff_vals,
tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+ } else {
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
+ }
}
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
index 4570e2ce6..2c67f450f 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -256,4 +256,6 @@ SADNXN4D 4, 16
SADNXN4D 16, 4
SADNXN4D 8, 32
SADNXN4D 32, 8
+SADNXN4D 16, 64
+SADNXN4D 64, 16
%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
index 88d427077..b4cc6abf1 100644
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -163,6 +163,10 @@ SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
+%if CONFIG_EXT_PARTITION_TYPES
+SAD64XN 16 ; sad64x16_sse2
+SAD64XN 16, 1 ; sad64x16_avg_sse2
+%endif
; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
@@ -261,6 +265,8 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD16XN 4 ; sad_16x4_sse2
SAD16XN 4, 1 ; sad_16x4_avg_sse2
+SAD16XN 64 ; sad_16x64_sse2
+SAD16XN 64, 1 ; sad_16x64_avg_sse2
%endif
; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
index 4f7a60c22..1a8fed710 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -15,6 +15,7 @@
#include <immintrin.h>
#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/common_avx2.h"
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -34,135 +35,6 @@ static INLINE void mm256_reverse_epi16(__m256i *u) {
*u = _mm256_permute2x128_si256(v, v, 1);
}
-// Note: in and out could have the same value
-static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
- __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
- __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
- __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
- __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
- __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
- __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
- __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
- __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
- __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
- __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
- __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
- __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
- __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
- __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
- __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
- __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
- // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
- // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
- // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
- // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
- // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
- // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
- // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
- // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
-
- // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
- // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
- // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
- // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
- // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
- // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
- // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
- // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
-
- __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
- __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
- __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
- __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
- __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
- __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
- __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
- __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
- __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
- __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
- __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
- __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
- __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
- __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
- __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
- __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
- // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
- // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
- // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
- // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
- // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
- // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
- // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
- // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
-
- // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
- // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
- // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
- // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
- // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
- // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
- // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
- // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
-
- tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
- tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
- tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
- tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
- tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
- tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
- tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
- tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
- tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
- tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
- tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
- tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
- tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
- tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
- tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
- tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
- // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
- // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
- // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
- // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
- // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
- // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
- // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
- // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
-
- // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
- // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
- // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
- // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
- // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
- // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
- // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
- // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
-
- out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
- out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
- out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
- out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
- out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
- out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
- out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
- out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
- out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
- out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
- out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
- out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
- out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
- out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
- out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
- out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-
static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
const __m256i *cospi) {
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
index e4ac56339..4e6eecd32 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
@@ -16,16 +16,16 @@
// This header file should be put below any x86 intrinsics head file
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_storeu_si128((__m128i *)(dst_ptr), out0);
- _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_HIGHBITDEPTH
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+ } else {
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+ }
}
#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 918844185..211fad3f8 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -382,6 +382,28 @@ unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride,
assert(sum >= -255 * 32 * 8);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
}
+
+unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 16 * 64);
+ assert(sum >= -255 * 16 * 64);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum,
+ aom_get16x16var_sse2, 16);
+ assert(sum <= 255 * 64 * 16);
+ assert(sum >= -255 * 64 * 16);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
#endif
// The 2 unused parameters are place holders for PIC enabled build.
@@ -451,7 +473,9 @@ DECLS(ssse3);
FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
- FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t))
+ FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
#else
#define FNS(opt) \
FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
@@ -543,7 +567,9 @@ DECLS(ssse3);
FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
- FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t))
+ FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
#else
#define FNS(opt) \
FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \