author     trav90 <travawine@palemoon.org>   2018-10-18 21:53:44 -0500
committer  trav90 <travawine@palemoon.org>   2018-10-18 21:53:44 -0500
commit     ec910d81405c736a4490383a250299a7837c2e64 (patch)
tree       4f27cc226f93a863121aef6c56313e4153a69b3e /third_party/aom/aom_dsp/x86
parent     01eb57073ba97b2d6cbf20f745dfcc508197adc3 (diff)
Update aom to commit id e87fb2378f01103d5d6e477a4ef6892dc714e614
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
32 files changed, 5938 insertions, 1337 deletions
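Two recurring patterns in the hunks below may be worth a short illustration; both sketches are editorial additions, not part of the patch.

First, the fwd_txfm_avx2.h and fwd_txfm_sse2.h hunks replace "#if CONFIG_HIGHBITDEPTH" blocks in the coefficient store helpers with a branch on sizeof(tran_low_t) == 4, so one helper covers both the 16-bit and the 32-bit coefficient layout and the compiler discards the dead branch. A minimal standalone sketch of that idea, with coeff_t and store_coeffs as illustrative names standing in for tran_low_t and the real helpers:

#include <stdint.h>
#include <string.h>

typedef int32_t coeff_t; /* illustrative stand-in for tran_low_t */

/* Widen 8 packed int16 values into a coeff_t buffer when coeff_t is 32-bit,
 * otherwise copy them through unchanged. sizeof(coeff_t) is a compile-time
 * constant, so one branch is removed entirely, mirroring the old
 * #if CONFIG_HIGHBITDEPTH / #else split. */
static void store_coeffs(const int16_t *in, coeff_t *out) {
  if (sizeof(coeff_t) == 4) {
    for (int i = 0; i < 8; ++i) out[i] = in[i]; /* sign-extending store */
  } else {
    memcpy(out, in, 8 * sizeof(int16_t)); /* same-width store */
  }
}

Second, the new highbd_intrapred_avx2.c and highbd_intrapred_sse2.c files both rely on the commented "trick from pascal": the rounded 3-tap average (x + 2y + z + 2) >> 2 is built from the packed rounding-average instructions as avg(avg(x, z) - ((x ^ z) & 1), y). A scalar check of that identity over a small range, for reference only:

#include <assert.h>
#include <stdint.h>

/* Rounding average, the per-lane scalar equivalent of _mm_avg_epu16. */
static uint16_t avg_round(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a + b + 1) >> 1);
}

/* (x + 2y + z + 2) >> 2 via two rounding averages and a parity fix-up. */
static uint16_t avg3(uint16_t x, uint16_t y, uint16_t z) {
  const uint16_t a = avg_round(x, z);
  const uint16_t b = (uint16_t)(a - ((x ^ z) & 1));
  return avg_round(b, y);
}

int main(void) {
  for (uint32_t x = 0; x < 64; ++x)
    for (uint32_t y = 0; y < 64; ++y)
      for (uint32_t z = 0; z < 64; ++z)
        assert(avg3((uint16_t)x, (uint16_t)y, (uint16_t)z) ==
               ((x + 2 * y + z + 2) >> 2));
  return 0;
}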
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 357f37401..8688fb544 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -346,9 +346,15 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
     psraw       m0, 7
     psraw       m4, 7
 %ifidn %1, h8_add_src
+%if ARCH_X86=1 && CONFIG_PIC=1
+    pcmpeqb     m2, m2              ;all ones
+    psrlw       m2, 8               ;even_byte_mask
+%else
+    mova        m2, [GLOBAL(even_byte_mask)]
+%endif
     movu        m5, [srcq]
     mova        m7, m5
-    pand        m5, [even_byte_mask]
+    pand        m5, m2
     psrlw       m7, 8
     paddsw      m0, m5
     paddsw      m4, m7
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 000000000..5f9596a74
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_COMMON_AVX2_H
+#define AOM_DSP_X86_COMMON_AVX2_H
+
+#include <immintrin.h>
+
+#include "./aom_config.h"
+
+// Note: in and out could have the same value
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+  // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
+  // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+  // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+  // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+  // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+  // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+  // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+  // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+  // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+  // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+  // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+  // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+  // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+  // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
+  // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
+  // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
+  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
+  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
index d3aceae00..86df4a6f6 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
@@ -15,21 +15,21 @@
 #include "./aom_config.h"

 static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-#if CONFIG_HIGHBITDEPTH
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);

-  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);

-  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);

-  _mm256_storeu_si256((__m256i *)out, y0);
-  _mm256_storeu_si256((__m256i *)(out + 8), y1);
-#else
-  _mm256_storeu_si256((__m256i *)out, *coeff);
-#endif
+    _mm256_storeu_si256((__m256i *)out, y0);
+    _mm256_storeu_si256((__m256i *)(out + 8), y1);
+  } else {
+    _mm256_storeu_si256((__m256i *)out, *coeff);
+  }
 }

 #endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 26b2db2e0..58e8971dd 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32(
 }

 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  _mm_store_si128((__m128i *)(dst_ptr), out0);
-  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-#else
-  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-#endif  // CONFIG_HIGHBITDEPTH
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+    _mm_store_si128((__m128i *)(dst_ptr), out0);
+    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+  } else {
+    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+  }
 }

 static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
new file mode 100644
index 000000000..41b55c985
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// D45E_PRED +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y, + const __m256i *z) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a = _mm256_avg_epu16(*x, *z); + const __m256i b = + _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one)); + return _mm256_avg_epu16(b, *y); +} + +static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1, + const __m256i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m256i y = avg3_epu16(a0, a1, a2); + _mm256_storeu_si256((__m256i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 9)); + x0 = _mm256_insert_epi16(x0, above[23], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + x2 = _mm256_insert_epi16(x2, 
above[31], 15); + const __m256i y = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + x0 = _mm256_insert_epi16(x0, above[47], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i u = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst1, u); + + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17)); + y2 = _mm256_insert_epi16(y2, above[47], 15); + u = avg3_epu16(&y0, &y1, &y2); + _mm256_storeu_si256((__m256i *)dst2, u); +} + +void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i 
*)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + __m256i u = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst1, u); + + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33)); + y0 = _mm256_insert_epi16(y0, above[63], 15); + u = avg3_epu16(&y1, &y2, &y0); + _mm256_storeu_si256((__m256i *)dst2, u); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm index 5d84ef8a7..91b3d126c 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm @@ -257,200 +257,3 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above dec nlines4d jnz .loop REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps - movd m1, [aboveq-2] - movq m0, [aboveq] - pshuflw m1, m1, 0x0 - movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 - movlhps m1, m1 ; tl tl tl tl tl tl tl tl - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl - psllw m3, m4 - pcmpeqw m2, m2 - pxor m4, m4 ; min possible value - pxor m3, m2 ; max possible value - mova m1, [leftq] - pshuflw m2, m1, 0x0 - pshuflw m5, m1, 0x55 - movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - pshuflw m2, m1, 0xaa - pshuflw m5, m1, 0xff - movlhps m2, m5 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one - movd m1, [aboveq-2] - mova m0, [aboveq] - pshuflw m1, m1, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m3, m3 - pxor m4, m4 - pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 - pshuflw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m3, m3 - mov lineq, -4 - mova m2, m3 - punpcklqdq m1, m1 - psllw m3, m4 - add leftq, 16 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movd m1, [leftq+lineq*4] - movd m2, [leftq+lineq*4+2] - pshuflw m1, m1, 0x0 - pshuflw m2, m2, 0x0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - paddw m1, m0 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m1, m3 - pminsw m2, m3 - pmaxsw m1, m4 - pmaxsw m2, m4 - ;Store the values - mova [dstq ], m1 - mova [dstq+strideq*2], m2 - lea dstq, 
[dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps - movd m2, [aboveq-2] - mova m0, [aboveq] - mova m1, [aboveq+16] - pshuflw m2, m2, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - punpcklqdq m2, m2 - psllw m3, m4 - pcmpeqw m5, m5 - pxor m4, m4 ; min possible value - pxor m3, m5 ; max possible value - DEFINE_ARGS dst, stride, line, left - mov lineq, -8 - psubw m0, m2 - psubw m1, m2 -.loop: - movd m7, [leftq] - pshuflw m5, m7, 0x0 - pshuflw m2, m7, 0x55 - punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 - punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 - paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 - pminsw m6, m3 - pminsw m5, m3 - pmaxsw m6, m4 ; Clamp to the bit-depth - pmaxsw m5, m4 - mova [dstq ], m6 - mova [dstq +16], m5 - paddw m6, m2, m0 - paddw m2, m1 - pminsw m6, m3 - pminsw m2, m3 - pmaxsw m6, m4 - pmaxsw m2, m4 - mova [dstq+strideq*2 ], m6 - mova [dstq+strideq*2+16], m2 - lea dstq, [dstq+strideq*4] - inc lineq - lea leftq, [leftq+4] - - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps - movd m0, [aboveq-2] - mova m1, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - pshuflw m0, m0, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m5, m5 - movd m6, bpsd - psllw m5, m6 - pcmpeqw m7, m7 - pxor m6, m6 ; min possible value - pxor m5, m7 ; max possible value - punpcklqdq m0, m0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - psubw m1, m0 - psubw m2, m0 - psubw m3, m0 - psubw m4, m0 -.loop: - movd m7, [leftq] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +48], m0 - movd m7, [leftq+2] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2 ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+48], m0 - lea dstq, [dstq+strideq*4] - lea leftq, [leftq+4] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..691e166cf --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,1256 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + 
aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 
0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t 
*dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + 
const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = 
_mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = 
_mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void 
aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + 
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + 
_mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i 
*)dst, row3); +} + +void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); +} + +void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i h76543210 = _mm_load_si128((const __m128i *)above); + __m128i hx7654321 = _mm_srli_si128(h76543210, 2); + __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7); + __m128i hx8765432 = _mm_srli_si128(h87654321, 2); + __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); + __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); + dst += stride; + + // hcba98765 + h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); + h76543210 = _mm_insert_epi16(h76543210, above[11], 7); + // hxcba9876 + hx7654321 = _mm_srli_si128(h76543210, 2); + // hxxcba987 + hx8765432 = _mm_srli_si128(h76543210, 4); + avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); +} + +void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { 
+ (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + __m128i y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x0 = _mm_loadu_si128((const __m128i *)(above + 3)); + y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 4)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 5)); + x2 = _mm_insert_epi16(x2, above[11], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} + +static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, + const __m128i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m128i y = avg3_epu16(a0, a1, a2); + _mm_storeu_si128((__m128i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm_loadu_si128((const __m128i *)(above + 9)); + x0 = _mm_insert_epi16(x0, above[15], 7); + const __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); +} + +void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm_loadu_si128((const __m128i *)(above + 15)); + __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 16)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 17)); + x2 = _mm_insert_epi16(x2, above[23], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c new file mode 100644 index 000000000..b089a3f43 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const 
__m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + 
_mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = 
_mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, 
_mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 
8), row_1); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + 
row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..94c68885c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m256i *blt, + __m256i *lt, __m256i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *blt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *lt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *thr = _mm256_slli_epi16(y, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m256i *p, __m256i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); + q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); + } +} + +static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, + const __m256i *t, __m256i *hev) { + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); + __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); + h = _mm256_subs_epu16(h, *t); + + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + const __m256i zero = _mm256_setzero_si256(); + *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); +} + +static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, + const __m256i *l, const __m256i *bl, + __m256i *mask) { + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); + max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); + } + max = _mm256_subs_epu16(max, *l); + *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, + const __m256i *q, int bd, int start, + int end, __m256i *flat) { + __m256i max = _mm256_setzero_si256(); + int i; + for (i = start; i < end; ++i) { + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); + } + + __m256i ft; + if (bd == 8) + ft = _mm256_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); + + const __m256i zero = _mm256_setzero_si256(); + *flat = _mm256_cmpeq_epi16(ft, zero); +} + +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void 
highbd_flat_mask4(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, + __m256i *pixel) { + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(*pixel, *max); + clamped = _mm256_andnot_si256(mask, *pixel); + mask = _mm256_and_si256(mask, *max); + clamped = _mm256_or_si256(mask, clamped); + + mask = _mm256_cmpgt_epi16(clamped, *min); + clamped = _mm256_and_si256(mask, clamped); + mask = _mm256_andnot_si256(mask, *min); + *pixel = _mm256_or_si256(clamped, mask); +} + +static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, + const __m256i *th, int bd, __m256i *ps, + __m256i *qs) { + __m256i t80; + if (bd == 8) + t80 = _mm256_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm256_set1_epi16(0x200); + else // bd == 12 + t80 = _mm256_set1_epi16(0x800); + + __m256i ps0 = _mm256_subs_epi16(p[0], t80); + __m256i ps1 = _mm256_subs_epi16(p[1], t80); + __m256i qs0 = _mm256_subs_epi16(q[0], t80); + __m256i qs1 = _mm256_subs_epi16(q[1], t80); + + const __m256i one = _mm256_set1_epi16(1); + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i zero = _mm256_setzero_si256(); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filter = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + __m256i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm256_and_si256(filter, hev); + + const __m256i x = _mm256_subs_epi16(qs0, ps0); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm256_and_si256(filter, *mask); + + const __m256i t3 = _mm256_set1_epi16(3); + const __m256i t4 = _mm256_set1_epi16(4); + + __m256i filter1 = _mm256_adds_epi16(filter, t4); + __m256i filter2 = _mm256_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm256_srai_epi16(filter1, 3); + filter2 = _mm256_srai_epi16(filter2, 3); + + qs0 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + + qs[0] = _mm256_adds_epi16(qs0, t80); + ps[0] = _mm256_adds_epi16(ps0, t80); + + filter = _mm256_adds_epi16(filter1, one); + filter = _mm256_srai_epi16(filter, 1); + filter = _mm256_andnot_si256(hev, filter); + + qs1 = _mm256_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm256_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + + qs[1] = _mm256_adds_epi16(qs1, t80); + ps[1] = _mm256_adds_epi16(ps1, t80); +} +#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 + +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +} 
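Editorial note: the loop-filter helpers above express the usual edge test with saturating SIMD compares. As a readability aid, here is a scalar sketch of the decision that highbd_filter_mask builds, written from the comparisons in the code; filter_mask_ref is a hypothetical name for illustration only, and p[0..3]/q[0..3] denote the four pixels on each side of the edge with index 0 nearest to it.

#include <stdint.h>
#include <stdlib.h>

// Scalar sketch of the mask computed by highbd_filter_mask(): blimit and
// limit are the thresholds already shifted left by (bd - 8). A nonzero
// return value means the edge may be filtered.
static int filter_mask_ref(const uint16_t *p, const uint16_t *q,
                           int blimit, int limit) {
  int mask = abs(p[0] - q[0]) * 2 + abs(p[1] - q[1]) / 2 <= blimit;
  int i;
  for (i = 1; i < 4; ++i) {
    mask &= abs(p[i] - p[i - 1]) <= limit;  // neighbour steps on the p side
    mask &= abs(q[i] - q[i - 1]) <= limit;  // neighbour steps on the q side
  }
  return mask;
}

In the AVX2 code the same result is reached branch-free: when the blimit test fails, the running maximum is seeded with limit + 1, so after the final subs_epu16 against limit that lane is nonzero and the closing cmpeq produces an all-zero mask for it.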
+ +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} +#else +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + __m256i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); + + __m256i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); + + __m256i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m256i flat, flat2; + const __m256i one = _mm256_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm256_and_si256(flat, mask); + flat2 = _mm256_and_si256(flat2, flat); + + __m256i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); + + // flat and wide flat calculations + __m256i flat_p[3], flat_q[3]; + __m256i flat2_p[7], flat2_q[7]; + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + + __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), + _mm256_add_epi16(p[4], p[3])); + __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), + _mm256_add_epi16(q[4], q[3])); + + __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); + sum_p = _mm256_add_epi16(sum_p, sum_lp); + + __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); + sum_q = _mm256_add_epi16(sum_q, sum_lq); + sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); + sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); + flat2_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); + flat_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); + flat_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); + + __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); + __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); + __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); + __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); + + sum_q = _mm256_sub_epi16(sum_p, p[6]); + sum_p = _mm256_sub_epi16(sum_p, q[6]); + flat2_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); + 
flat2_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm256_sub_epi16(sum_lp, p[2]); + sum_lp = _mm256_sub_epi16(sum_lp, q[2]); + flat_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p3 = _mm256_add_epi16(sum_p3, p[3]); + sum_q3 = _mm256_add_epi16(sum_q3, q[3]); + + sum_p = _mm256_sub_epi16(sum_p, q[5]); + sum_q = _mm256_sub_epi16(sum_q, p[5]); + flat2_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm256_sub_epi16(sum_lp, q[1]); + sum_lq = _mm256_sub_epi16(sum_lq, p[1]); + flat_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm256_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm256_andnot_si256(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm256_and_si256(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm256_andnot_si256(flat, q[2]); + flat_q[2] = _mm256_and_si256(flat, flat_q[2]); + q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm256_andnot_si256(flat, ps[i]); + flat_p[i] = _mm256_and_si256(flat, flat_p[i]); + p[i] = _mm256_or_si256(ps[i], flat_p[i]); + qs[i] = _mm256_andnot_si256(flat, qs[i]); + flat_q[i] = _mm256_and_si256(flat, flat_q[i]); + q[i] = _mm256_or_si256(qs[i], flat_q[i]); + } + + // highbd_filter16 + + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm256_andnot_si256(flat2, p[i]); + flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm256_andnot_si256(flat2, q[i]); + flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); + q[i] = _mm256_or_si256(q[i], flat2_q[i]); + _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); + _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); + } +} + +static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, + uint16_t *dst, int dst_p) { + __m256i x[16]; + int i; + for (i = 0; i < 16; ++i) { + x[i] = _mm256_loadu_si256((const __m256i *)src); + src += src_p; + } + mm256_transpose_16x16(x, x); + for (i = 0; i < 16; ++i) { + _mm256_storeu_si256((__m256i *)dst, x[i]); + dst += dst_p; + } +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose16x16(s - 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, 
blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose16x16(t_dst, 16, s - 8, p); +} + +static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, + const uint8_t *t0, const uint8_t *b1, + const uint8_t *l1, const uint8_t *t1, int bd, + __m256i *blt, __m256i *lt, __m256i *thr) { + const __m128i z128 = _mm_setzero_si128(); + const __m128i blimit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); + const __m128i limit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); + const __m128i thresh0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); + const __m128i blimit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); + const __m128i limit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); + const __m128i thresh1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); + + *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); + *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); + *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); + + int shift = bd - 8; + *blt = _mm256_slli_epi16(*blt, shift); + *lt = _mm256_slli_epi16(*lt, shift); + *thr = _mm256_slli_epi16(*thr, shift); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80, tff80, tffe0, t1f, t7f; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + tff80 = _mm256_set1_epi16(0xff80); + tffe0 = _mm256_set1_epi16(0xffe0); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); + } else { // bd == 12 + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); + } + + __m256i ps1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); + __m256i ps0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); + __m256i qs0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * 
p)), t80); + __m256i qs1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + const __m256i zero = _mm256_setzero_si256(); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + __m256i work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), + _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), + _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // filter4 + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + __m256i work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); + + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + __m256i filter1 = _mm256_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + __m256i filter2 = _mm256_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); + + // Filter1 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm256_srli_epi16(filter1, 3); + work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm256_and_si256(filter1, t1f); // clamp the range + filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter2); + filter2 = _mm256_srli_epi16(filter2, 3); + work_a = _mm256_and_si256(work_a, tffe0); + filter2 = _mm256_and_si256(filter2, t1f); + filter2 = _mm256_or_si256(filter2, work_a); + + // filt >> 1 + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + filt = _mm256_adds_epi16(filter1, one); + work_a = _mm256_cmpgt_epi16(zero, filt); + filt = _mm256_srli_epi16(filt, 1); + work_a = _mm256_and_si256(work_a, tff80); + filt = _mm256_and_si256(filt, t7f); + filt = _mm256_or_si256(filt, work_a); + + filt = _mm256_andnot_si256(hev, filt); + + filter1 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &filter1); + q0 = _mm256_adds_epi16(filter1, t80); + + filter1 = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &filter1); + q1 = 
_mm256_adds_epi16(filter1, t80); + + filter2 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &filter2); + p0 = _mm256_adds_epi16(filter2, t80); + + filter2 = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &filter2); + p1 = _mm256_adds_epi16(filter2, t80); + + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + } else if (bd == 10) { + t80 = _mm256_set1_epi16(0x200); + } else { // bd == 12 + t80 = _mm256_set1_epi16(0x800); + } + + __m256i ps1, ps0, qs0, qs1; + ps1 = _mm256_subs_epi16(p1, t80); + ps0 = _mm256_subs_epi16(p0, t80); + qs0 = _mm256_subs_epi16(q0, t80); + qs1 = _mm256_subs_epi16(q1, t80); + + // filter_mask and hev_mask + __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i zero = _mm256_set1_epi16(0); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm256_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, 
limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); + flat = _mm256_max_epi16(work, flat); + flat = _mm256_max_epi16(abs_p1p0, flat); + flat = _mm256_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm256_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); + + flat = _mm256_cmpeq_epi16(flat, zero); + flat = _mm256_and_si256(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + __m256i workp_a, workp_b, workp_shft; + workp_a = + _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); + const __m256i four = _mm256_set1_epi16(4); + workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); + + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); + + // lp filter + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt, filter1, filter2, work_a; + filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + filter1 = _mm256_adds_epi16(filt, t4); + filter2 = _mm256_adds_epi16(filt, t3); + + // Filter1 >> 3 + pixel_clamp(&pmin, &pmax, &filter1); + filter1 = _mm256_srai_epi16(filter1, 3); + + // Filter2 >> 3 + pixel_clamp(&pmin, &pmax, &filter2); + filter2 = _mm256_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm256_adds_epi16(filter1, one); + filt = 
_mm256_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm256_andnot_si256(hev, filt); + + work_a = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q0 = _mm256_loadu_si256((__m256i *)flat_oq0); + work_a = _mm256_andnot_si256(flat, work_a); + q0 = _mm256_and_si256(flat, q0); + q0 = _mm256_or_si256(work_a, q0); + + work_a = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q1 = _mm256_loadu_si256((__m256i *)flat_oq1); + work_a = _mm256_andnot_si256(flat, work_a); + q1 = _mm256_and_si256(flat, q1); + q1 = _mm256_or_si256(work_a, q1); + + work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + q2 = _mm256_loadu_si256((__m256i *)flat_oq2); + work_a = _mm256_andnot_si256(flat, work_a); + q2 = _mm256_and_si256(flat, q2); + q2 = _mm256_or_si256(work_a, q2); + + work_a = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p0 = _mm256_loadu_si256((__m256i *)flat_op0); + work_a = _mm256_andnot_si256(flat, work_a); + p0 = _mm256_and_si256(flat, p0); + p0 = _mm256_or_si256(work_a, p0); + + work_a = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p1 = _mm256_loadu_si256((__m256i *)flat_op1); + work_a = _mm256_andnot_si256(flat, work_a); + p1 = _mm256_and_si256(flat, p1); + p1 = _mm256_or_si256(work_a, p1); + + work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + p2 = _mm256_loadu_si256((__m256i *)flat_op2); + work_a = _mm256_andnot_si256(flat, work_a); + p2 = _mm256_and_si256(flat, p2); + p2 = _mm256_or_si256(work_a, p2); + + _mm256_storeu_si256((__m256i *)(s - 3 * p), p2); + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); + _mm256_storeu_si256((__m256i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} +#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c 
b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 76369871b..0a399edf2 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -12,135 +12,135 @@ #include <emmintrin.h> // SSE2 #include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" +#include "aom_dsp/x86/lpf_common_sse2.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_ports/mem.h" -static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { - __m128i ubounded; - __m128i lbounded; - __m128i retval; +static INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + __m128i clamped, mask; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i t80, max, min; + mask = _mm_cmpgt_epi16(*pixel, *max); + clamped = _mm_andnot_si128(mask, *pixel); + mask = _mm_and_si128(mask, *max); + clamped = _mm_or_si128(mask, clamped); - if (bd == 8) { - t80 = _mm_set1_epi16(0x80); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); - } else if (bd == 10) { - t80 = _mm_set1_epi16(0x200); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); - } else { // bd == 12 - t80 = _mm_set1_epi16(0x800); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); - } + mask = _mm_cmpgt_epi16(clamped, *min); + clamped = _mm_and_si128(mask, clamped); + mask = _mm_andnot_si128(mask, *min); + *pixel = _mm_or_si128(clamped, mask); +} - min = _mm_subs_epi16(zero, t80); +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); - ubounded = _mm_cmpgt_epi16(value, max); - lbounded = _mm_cmplt_epi16(value, min); - retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); - ubounded = _mm_and_si128(ubounded, max); - lbounded = _mm_and_si128(lbounded, min); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_or_si128(retval, lbounded); - return retval; -} + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); -// TODO(debargha, peter): Break up large functions into smaller ones -// in this file. 
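The new pixel_clamp() above replaces the per-call signed_char_clamp_bd_sse2() with a branchless compare-and-select: lanes greater than *max are forced to *max, then lanes not greater than *min are forced to *min. A minimal standalone sketch of that idiom follows; clamp_epi16 and the test values are illustrative names for this note, not part of the commit.

#include <emmintrin.h>  // SSE2
#include <stdio.h>

// Branchless clamp of each signed 16-bit lane to [lo, hi].
// select(mask, a, b) == (mask & a) | (~mask & b); the compare intrinsics
// guarantee each mask lane is all-ones or all-zeros, so no branches are needed.
static __m128i clamp_epi16(__m128i v, __m128i lo, __m128i hi) {
  const __m128i gt_hi = _mm_cmpgt_epi16(v, hi);      // lanes where v > hi
  v = _mm_or_si128(_mm_and_si128(gt_hi, hi),         // ... take hi there
                   _mm_andnot_si128(gt_hi, v));      // ... keep v elsewhere
  const __m128i gt_lo = _mm_cmpgt_epi16(v, lo);      // lanes where v > lo
  return _mm_or_si128(_mm_and_si128(gt_lo, v),       // ... keep v there
                      _mm_andnot_si128(gt_lo, lo));  // ... take lo elsewhere
}

int main(void) {
  const __m128i v = _mm_setr_epi16(-900, -5, 0, 5, 200, 511, 512, 1000);
  const __m128i r = clamp_epi16(v, _mm_set1_epi16(-512), _mm_set1_epi16(511));
  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n");  // prints: -512 -5 0 5 200 511 511 511
  return 0;
}

SSE2 also provides _mm_min_epi16/_mm_max_epi16, which could express the same clamp in two instructions; the compare-and-select form used in pixel_clamp() has the same shape as the mask blends used elsewhere in this file, and that same and/andnot/or select reappears wherever results are merged under the flat and flat2 masks.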
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; - __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; - __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; - __m128i ps1, qs1, ps0, qs0; - __m128i abs_p0q0, abs_p1q1, ffff, work; - __m128i filt, work_a, filter1, filter2; - __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; - __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; - __m128i flat2_q0, flat2_p0; - __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3; - __m128i t4, t3, t80, t1; - __m128i eight, four; + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); } +} +// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); +static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q, + const __m128i *t, __m128i *hev) { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1])); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1])); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *t); - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - - // highbd_filter_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i zero = _mm_setzero_si128(); + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); +} - ffff = 
_mm_cmpeq_epi16(abs_p1p0, abs_p1p0); +static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0])); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1])); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]), + _mm_subs_epu16(p[i - 1], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]), + _mm_subs_epu16(q[i - 1], q[i]))); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} - // highbd_hev_mask (in C code this is actually called from highbd_filter4) - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p, + const __m128i *q, int bd, int start, + int end, __m128i *flat) { + __m128i max = _mm_setzero_si128(); + int i; + for (i = start; i < end; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]), + _mm_subs_epu16(p[0], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]), + _mm_subs_epu16(q[0], q[i]))); + } - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); + __m128i ft; + if (bd == 8) + ft = _mm_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4)); - mask = _mm_subs_epu16(mask, limit); - mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} - // lp filter - // highbd_filter4 - t4 = _mm_set1_epi16(4); - t3 = _mm_set1_epi16(3); +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], 
p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, + const __m128i *th, int bd, __m128i *ps, + __m128i *qs) { + __m128i t80; if (bd == 8) t80 = _mm_set1_epi16(0x80); else if (bd == 10) @@ -148,340 +148,283 @@ void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, else // bd == 12 t80 = _mm_set1_epi16(0x800); - t1 = _mm_set1_epi16(0x1); + __m128i ps0 = _mm_subs_epi16(p[0], t80); + __m128i ps1 = _mm_subs_epi16(p[1], t80); + __m128i qs0 = _mm_subs_epi16(q[0], t80); + __m128i qs1 = _mm_subs_epi16(q[1], t80); - ps1 = _mm_subs_epi16(p1, t80); - qs1 = _mm_subs_epi16(q1, t80); - ps0 = _mm_subs_epi16(p0, t80); - qs0 = _mm_subs_epi16(q0, t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, t80); - filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), - hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); - filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); - // Filter1 >> 3 - filter1 = _mm_srai_epi16(filter1, 0x3); - filter2 = _mm_srai_epi16(filter2, 0x3); + __m128i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm_and_si128(filter, hev); - qs0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - ps0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); - // end highbd_filter4 - // loopfilter done + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); - // highbd_flat_mask4 - flat = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); - flat = _mm_max_epi16(work, flat); - work = _mm_max_epi16(abs_p1p0, abs_q1q0); - flat = _mm_max_epi16(work, flat); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); - if (bd == 8) - flat = _mm_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm_subs_epu16(flat, 
_mm_slli_epi16(one, 4)); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); - flat = _mm_cmpeq_epi16(flat, zero); - // end flat_mask4 + qs[0] = _mm_adds_epi16(qs0, t80); + ps[0] = _mm_adds_epi16(ps0, t80); - // flat & mask = flat && mask (as used in filter8) - // (because, in both vars, each block of 16 either all 1s or all 0s) - flat = _mm_and_si128(flat, mask); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); - // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 - // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + qs[1] = _mm_adds_epi16(qs1, t80); + ps[1] = _mm_adds_epi16(ps1, t80); +} - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); - flat2 = _mm_max_epi16(work, flat2); +typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); - flat2 = _mm_max_epi16(work, flat2); +static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd, + PixelOutput pixel_output) { + __m128i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); - flat2 = _mm_max_epi16(work, flat2); + __m128i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); - if (bd == 8) - flat2 = _mm_subs_epu16(flat2, one); - else if (bd == 10) - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + __m128i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m128i flat, flat2; + const __m128i one = _mm_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); - flat2 = _mm_cmpeq_epi16(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - // end highbd_flat_mask5 + __m128i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations - eight = _mm_set1_epi16(8); - four = _mm_set1_epi16(4); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); - pixelFilter_q = 
_mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - flat2_p0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); - flat2_q0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); - - sum_p7 = _mm_add_epi16(p7, p7); - sum_q7 = _mm_add_epi16(q7, q7); - sum_p3 = _mm_add_epi16(p3, p3); - sum_q3 = _mm_add_epi16(q3, q3); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); - flat2_p1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); - flat2_q1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - sum_p3 = _mm_add_epi16(sum_p3, p3); - sum_q3 = _mm_add_epi16(sum_q3, q3); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q6)), 4); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // highbd_filter8 - p2 = _mm_andnot_si128(flat, p2); + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[7], flat2_q[7]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i sum_p = + _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); + __m128i sum_q = + _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); + + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); + flat2_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + + __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); + __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p, p[6]); + sum_p = _mm_sub_epi16(sum_p, q[6]); + flat2_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); + flat2_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + + sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_q, p[5]); + flat2_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p = _mm_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) - flat_p2 = _mm_and_si128(flat, flat_p2); + flat_p[2] = _mm_and_si128(flat, flat_p[2]); // when (flat && mask) - p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values - - ps1 = _mm_andnot_si128(flat, ps1); - // p1 takes the value assigned to in in filter4 if !(flat && mask) - flat_p1 = _mm_and_si128(flat, flat_p1); - // when (flat && mask) - p1 = _mm_or_si128(ps1, flat_p1); // 
full list of p1 values - qs1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values - - ps0 = _mm_andnot_si128(flat, ps0); - // p0 takes the value assigned to in in filter4 if !(flat && mask) - flat_p0 = _mm_and_si128(flat, flat_p0); - // when (flat && mask) - p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values - qs0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values - // end highbd_filter8 + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } // highbd_filter16 - p6 = _mm_andnot_si128(flat2, p6); - // p6 remains unchanged if !(flat2 && flat && mask) - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - // get values for when (flat2 && flat && mask) - p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values - q6 = _mm_andnot_si128(flat2, q6); - // q6 remains unchanged if !(flat2 && flat && mask) - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - // get values for when (flat2 && flat && mask) - q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); - - p5 = _mm_andnot_si128(flat2, p5); - // p5 remains unchanged if !(flat2 && flat && mask) - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - // get values for when (flat2 && flat && mask) - p5 = _mm_or_si128(p5, flat2_p5); - // full list of p5 values - q5 = _mm_andnot_si128(flat2, q5); - // q5 remains unchanged if !(flat2 && flat && mask) - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - // get values for when (flat2 && flat && mask) - q5 = _mm_or_si128(q5, flat2_q5); - // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); - - p4 = _mm_andnot_si128(flat2, p4); - // p4 remains unchanged if !(flat2 && flat && mask) - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - // get values for when (flat2 && flat && mask) - p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values - q4 = _mm_andnot_si128(flat2, q4); - // q4 remains unchanged if !(flat2 && flat && mask) - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - // get values for when (flat2 && flat && mask) - q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); - - p3 = _mm_andnot_si128(flat2, p3); - // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - // get values for when (flat2 && flat && mask) - p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values - q3 = _mm_andnot_si128(flat2, q3); - // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - // get values for when (flat2 && flat && mask) - q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); - - p2 = _mm_andnot_si128(flat2, p2); - // p2 takes value from highbd_filter8 if 
!(flat2 && flat && mask) - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - // get values for when (flat2 && flat && mask) - p2 = _mm_or_si128(p2, flat2_p2); - // full list of p2 values - q2 = _mm_andnot_si128(flat2, q2); - // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - // get values for when (flat2 && flat && mask) - q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); - - p1 = _mm_andnot_si128(flat2, p1); - // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - // get values for when (flat2 && flat && mask) - p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values - q1 = _mm_andnot_si128(flat2, q1); - // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - // get values for when (flat2 && flat && mask) - q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - - p0 = _mm_andnot_si128(flat2, p0); - // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - // get values for when (flat2 && flat && mask) - p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values - q0 = _mm_andnot_si128(flat2, q0); - // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - // get values for when (flat2 && flat && mask) - q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + + if (pixel_output == FOUR_PIXELS) { + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); + } + } else { // EIGHT_PIXELS + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } + } +} + +// Note: +// highbd_lpf_horz_edge_8_8p() output 8 pixels per register +// highbd_lpf_horz_edge_8_4p() output 4 pixels per register +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); +} +#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + +static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + 
highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); +} + +void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); +#endif } void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); + highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); +#endif +} + +static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, + const __m128i *p0, const __m128i *q0, + const __m128i *q1, const __m128i *q2, + int p, uint16_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); +#else + _mm_store_si128((__m128i *)(s - 3 * p), *p2); + _mm_store_si128((__m128i *)(s - 2 * p), *p1); + _mm_store_si128((__m128i *)(s - 1 * p), *p0); + _mm_store_si128((__m128i *)(s + 0 * p), *q0); + _mm_store_si128((__m128i *)(s + 1 * p), *q1); + _mm_store_si128((__m128i *)(s + 2 * p), *q2); +#endif } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -497,14 +440,14 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -635,41 +578,48 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); // lp filter - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, hev); 
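The pmax/pmin pair introduced in the lp filter above is the legal range of the t80-biased samples: once t80 is subtracted the pixels behave as signed values, with pmax = (1 << bd) - 1 - t80 and pmin = -t80. A small sketch that prints these constants for the three supported bit depths (illustrative only, not part of the commit):

#include <stdio.h>

// Working range of the t80-biased samples used by the filter:
//   t80  = 1 << (bd - 1)        (0x80, 0x200, 0x800 for bd = 8, 10, 12)
//   pmax = (1 << bd) - 1 - t80
//   pmin = -t80
int main(void) {
  static const int bds[3] = { 8, 10, 12 };
  for (int i = 0; i < 3; ++i) {
    const int bd = bds[i];
    const int t80 = 1 << (bd - 1);
    const int pmax = (1 << bd) - 1 - t80;
    const int pmin = -t80;
    printf("bd=%2d  t80=0x%x  pmax=%d  pmin=%d\n", bd, t80, pmax, pmin);
  }
  // bd= 8  t80=0x80   pmax=127   pmin=-128
  // bd=10  t80=0x200  pmax=511   pmin=-512
  // bd=12  t80=0x800  pmax=2047  pmin=-2048
  return 0;
}

pixel_clamp() applies these bounds after each add/subtract because 16-bit saturating arithmetic alone saturates at +/-32767, which is not tight enough for the 10- and 12-bit ranges.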
work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = signed_char_clamp_bd_sse2(filt, bd); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi16(filt, t4); filter2 = _mm_adds_epi16(filt, t3); // Filter1 >> 3 - filter1 = signed_char_clamp_bd_sse2(filter1, bd); + pixel_clamp(&pmin, &pmax, &filter1); filter1 = _mm_srai_epi16(filter1, 3); // Filter2 >> 3 - filter2 = signed_char_clamp_bd_sse2(filter2, bd); + pixel_clamp(&pmin, &pmax, &filter2); filter2 = _mm_srai_epi16(filter2, 3); // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; filt = _mm_andnot_si128(hev, filt); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q0 = _mm_load_si128((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q1 = _mm_load_si128((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); @@ -682,14 +632,16 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p0 = _mm_load_si128((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p1 = _mm_load_si128((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); @@ -702,12 +654,7 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); } void aom_highbd_lpf_horizontal_8_dual_sse2( @@ -725,14 +672,18 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -743,7 +694,7 
@@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - __m128i work; + const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); __m128i t80; @@ -814,9 +765,9 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); mask = _mm_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16( + +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) + __m128i work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); @@ -824,22 +775,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); +#endif mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); // filter4 - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); // (aom_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + filter1 = _mm_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + + filter2 = _mm_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); // Filter1 >> 3 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 @@ -865,19 +826,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, filt = _mm_andnot_si128(hev, filt); - q0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - p0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); - + q0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &q0); + q0 = _mm_adds_epi16(q0, t80); + + q1 = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &q1); + q1 = _mm_adds_epi16(q1, t80); + + p0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &p0); + p0 = _mm_adds_epi16(p0, t80); + + p1 = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &p1); + p1 = _mm_adds_epi16(p1, t80); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); +#else _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +#endif 
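Stripped of the SIMD mechanics, the filter4 section above follows a short scalar recipe. The sketch below restates it with illustrative names (filter4_scalar, CLAMP), working in the t80-biased domain; the committed code re-adds t80 before storing and implements the shifts with _mm_srai_epi16 or the srli-plus-sign-mask trick seen earlier.

// Scalar outline of the filter4 data flow mirrored by the SSE2/AVX2 code.
// ps1/ps0/qs0/qs1 are t80-biased samples; hev and mask are 0 or -1 (all-ones);
// pmin/pmax are the bit-depth bounds shown above. '>>' on a negative value is
// meant as an arithmetic shift, which is what the vector code implements.
static void filter4_scalar(int ps1, int ps0, int qs0, int qs1,
                           int hev, int mask, int pmin, int pmax,
                           int *op1, int *op0, int *oq0, int *oq1) {
#define CLAMP(x) ((x) < (pmin) ? (pmin) : ((x) > (pmax) ? (pmax) : (x)))
  int filt = CLAMP(ps1 - qs1) & hev;            // only high-edge-variance rows
  filt = CLAMP(filt + 3 * (qs0 - ps0)) & mask;  // and only rows passing the mask

  const int filter1 = CLAMP(filt + 4) >> 3;
  const int filter2 = CLAMP(filt + 3) >> 3;

  *oq0 = CLAMP(qs0 - filter1);                  // inner taps always adjusted
  *op0 = CLAMP(ps0 + filter2);

  filt = ((filter1 + 1) >> 1) & ~hev;           // ROUND_POWER_OF_TWO(filter1, 1)
  *oq1 = CLAMP(qs1 - filt);                     // outer taps only when !hev
  *op1 = CLAMP(ps1 + filt);
#undef CLAMP
}

The caller then adds t80 back to p1/p0/q0/q1 and, under CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4, stores only the low four pixels of each register with _mm_storel_epi64 instead of all eight.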
} void aom_highbd_lpf_horizontal_4_dual_sse2( @@ -888,118 +862,6 @@ void aom_highbd_lpf_horizontal_4_dual_sse2( aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } -static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], - int out_p, int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; - do { - uint16_t *in = src[idx8x8]; - uint16_t *out = dst[idx8x8]; - - p0 = - _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - p1 = - _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - p2 = - _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - p3 = - _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - p4 = - _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - p5 = - _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - p6 = - _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - p7 = - _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 00 10 01 11 02 12 03 13 - x0 = _mm_unpacklo_epi16(p0, p1); - // 20 30 21 31 22 32 23 33 - x1 = _mm_unpacklo_epi16(p2, p3); - // 40 50 41 51 42 52 43 53 - x2 = _mm_unpacklo_epi16(p4, p5); - // 60 70 61 71 62 72 63 73 - x3 = _mm_unpacklo_epi16(p6, p7); - // 00 10 20 30 01 11 21 31 - x4 = _mm_unpacklo_epi32(x0, x1); - // 40 50 60 70 41 51 61 71 - x5 = _mm_unpacklo_epi32(x2, x3); - // 00 10 20 30 40 50 60 70 - x6 = _mm_unpacklo_epi64(x4, x5); - // 01 11 21 31 41 51 61 71 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); - // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); - // 01 11 21 31 41 51 61 71 - - // 02 12 22 32 03 13 23 33 - x4 = _mm_unpackhi_epi32(x0, x1); - // 42 52 62 72 43 53 63 73 - x5 = _mm_unpackhi_epi32(x2, x3); - // 02 12 22 32 42 52 62 72 - x6 = _mm_unpacklo_epi64(x4, x5); - // 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); - // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); - // 03 13 23 33 43 53 63 73 - - // 04 14 05 15 06 16 07 17 - x0 = _mm_unpackhi_epi16(p0, p1); - // 24 34 25 35 26 36 27 37 - x1 = _mm_unpackhi_epi16(p2, p3); - // 44 54 45 55 46 56 47 57 - x2 = _mm_unpackhi_epi16(p4, p5); - // 64 74 65 75 66 76 67 77 - x3 = _mm_unpackhi_epi16(p6, p7); - // 04 14 24 34 05 15 25 35 - x4 = _mm_unpacklo_epi32(x0, x1); - // 44 54 64 74 45 55 65 75 - x5 = _mm_unpacklo_epi32(x2, x3); - // 04 14 24 34 44 54 64 74 - x6 = _mm_unpacklo_epi64(x4, x5); - // 05 15 25 35 45 55 65 75 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); - // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); - // 05 15 25 35 45 55 65 75 - - // 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi32(x0, x1); - // 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi32(x2, x3); - // 06 16 26 36 46 56 66 76 - x6 = _mm_unpacklo_epi64(x4, x5); - // 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); - // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); - // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, - uint16_t *out, int out_p) { - uint16_t *src0[1]; - uint16_t *src1[1]; - uint16_t 
*dest0[1]; - uint16_t *dest1[1]; - src0[0] = in0; - src1[0] = in1; - dest0[0] = out; - dest1[0] = out + 8; - highbd_transpose(src0, in_p, dest0, out_p, 1); - highbd_transpose(src1, in_p, dest1, out_p, 1); -} - void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { @@ -1130,10 +992,12 @@ void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - // Loop filtering +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); +#else aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); - +#endif // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 9c3bbdd69..855bc6558 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -293,4 +293,6 @@ HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 248b98ef5..760e68aab 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -158,7 +158,10 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 - +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 +%endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -302,6 +305,8 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 %endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c index 7bc8a0df3..befd81269 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -177,177 +177,94 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } -static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 8; - src += 8; - pred += 8; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void 
subtract_16x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 16; - src += 16; - pred += 16; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 32; - src += 32; - pred += 32; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 64; - src += 64; - pred += 64; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t 
pred_stride) { - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +#endif +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); } +SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); } +#endif static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { - SubtractWxHFuncType ret_func_ptr = NULL; if (rows == 4) { - if (cols == 4) { - ret_func_ptr = subtract_4x4; - } else if (cols == 8) { - ret_func_ptr = subtract_8x4; - } - } else if (rows == 8) { - if (cols == 4) { - ret_func_ptr = subtract_4x8; - } else if (cols == 8) { - ret_func_ptr = subtract_8x8; - } else if (cols == 16) { - ret_func_ptr = subtract_16x8; - } - } else if (rows == 16) { - if (cols == 8) { - ret_func_ptr = subtract_8x16; - } else if (cols == 16) { - ret_func_ptr = subtract_16x16; - } else if (cols == 32) { - ret_func_ptr = subtract_32x16; - } - } else if (rows == 32) { - if (cols == 16) { - ret_func_ptr = subtract_16x32; - } else if (cols == 32) { - ret_func_ptr = subtract_32x32; - } else if (cols == 64) { - ret_func_ptr = subtract_64x32; - } - } else if (rows == 64) { - if (cols == 32) { - ret_func_ptr = subtract_32x64; - } else if (cols == 64) { - ret_func_ptr = subtract_64x64; - } else if (cols == 128) { - ret_func_ptr = subtract_128x64; - } - } else if (rows == 128) { - if (cols == 64) { - ret_func_ptr = subtract_64x128; - } else if (cols == 128) { - ret_func_ptr = subtract_128x128; - } + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return 
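/* Recap of the composition above: subtract_4x16 is defined as
   STACK_V(8, subtract_4x8) — the 4x8 kernel run twice, with the second
   call offset 8 rows further into diff, src and pred; the other
   rectangular sizes are stacked the same way via STACK_V/STACK_H. */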
subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x32; +#endif // CONFIG_EXT_PARTITION + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x64; +#endif // CONFIG_EXT_PARTITION } - if (!ret_func_ptr) { - assert(0); +#if CONFIG_EXT_PARTITION + if (rows == 128) { + if (cols == 32) return subtract_32x128; + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; } - return ret_func_ptr; +#endif // CONFIG_EXT_PARTITION + assert(0); + return NULL; } void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 93923ffb0..62acf3ed3 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -189,6 +189,8 @@ VAR_FN(8, 8, 8, 6); VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 16, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); #endif #undef VAR_FN @@ -411,7 +413,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)) + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ @@ -588,7 +592,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..6b8922b8c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows togeter. This function does line: +// 0,1,2,3, and 16,17,18,19. The next call would do +// 4,5,6,7, and 20,21,22,23. So 4 times of calling +// would finish 32 rows. 
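// In plain C terms the whole 32x32 H prediction reduces to
//   for (r = 0; r < 32; ++r) memset(dst + r * stride, left[r], 32);
// the helper below just produces rows r and r + 16 of that pattern from
// one shuffled register per iteration.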
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle + +// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. +// Use a header file, intrapred_common_x86.h +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint32_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, 
ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void 
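/* paeth_pred() above is the classic Paeth rule, vectorised: with
   base = left + top - topleft, pick whichever of left, top or topleft is
   closest to base, ties going to left, then top.  mask1 deselects left,
   mask2 chooses between top and topleft. */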
aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm index 02567db49..9aece27be 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm @@ -623,149 +623,3 @@ cglobal h_predictor_32x32, 2, 5, 3, dst, stride, 
line, left lea dstq, [dstq+strideq*4] jnz .loop REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop: - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... 
l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..2a83b9001 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { + int i; + for (i = 0; i < 4; ++i) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_16(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = 
dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above 
= _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + 
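/* sum_left now holds the DC value (sum of 16 left pixels + 8) >> 4 in its
   low word; the unpack/shuffle sequence below broadcasts that byte across
   all 16 lanes before 16 rows of 32 pixels are stored. */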
sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4x8(pred, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + (void)left; + int i; + for (i = 0; i < 16; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const 
uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, 
left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + int i = 0; + + do { + left_col = _mm_load_si128((const __m128i *)left); + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < 2); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void 
h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 000000000..85b82744e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,885 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = 
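/* Throughout these predictors rep starts at 0x8000 and is incremented once
   per row: used as a pshufb control, the 0x80 byte in each 16-bit lane
   zeroes the high result byte while the low byte i selects left[i], so
   _mm_shuffle_epi8(l, rep) broadcasts left pixel i as 16-bit lanes. */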
_mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i 
< 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16(0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + pixels[1] = _mm_loadl_epi64((const __m128i *)left); + + const __m128i bp = 
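/* bp below is below_pred = left[height - 1]; SMOOTH_PRED blends each
   'above' sample with bp and each 'left' sample with right_pred =
   above[3] (above[w - 1]) using sm_weight_arrays, then averages the two
   blends — hence the (1 + sm_weight_log2_scale) shift in smooth_pred_4xh. */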
_mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vecotr +// weights[2]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + weights[0] = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]); + + if (height == 8) { + t = _mm_srli_si128(t, 4); + weights[0] = _mm_unpacklo_epi8(t, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, weight[2]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 4, weights); + + smooth_pred_4xh(pixels, weights, 4, dst, stride); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 8, weights); + + smooth_pred_4xh(pixels, weights, 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + pixels[2] = _mm_load_si128((const __m128i *)left); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave 
vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 
0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, 1/4 +// pixels[1]: above and below_pred interleave vector, 2/4 +// pixels[2]: above and below_pred interleave vector, 3/4 +// pixels[3]: above and below_pred interleave vector, 3/4 +// pixels[4]: left vector +// pixels[5]: left vector, h = 32 only +// pixels[6]: right_pred vector +static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab = _mm_load_si128((const __m128i *)above); + pixels[6] = _mm_set1_epi16((uint16_t)above[15]); + pixels[4] = _mm_load_si128((const __m128i *)left); + pixels[5] = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// ... ... +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +// ... ... +static INLINE void load_weight_w16(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 8) { + weight_h[0] = _mm_unpacklo_epi8(w8, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + } + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, 
weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[4] : pixels[5]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc); + __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc); + + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[6]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + __m128i sum2 = _mm_madd_epi16(b, ww[2]); + __m128i sum3 = _mm_madd_epi16(b, ww[3]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + s2 = _mm_add_epi32(s2, sum2); + s2 = _mm_add_epi32(s2, round); + s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale); + + s3 = _mm_add_epi32(s3, sum3); + s3 = _mm_add_epi32(s3, round); + s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + sum1 = _mm_packus_epi16(s2, s3); + sum1 = _mm_shuffle_epi8(sum1, gat); + + _mm_storel_epi64((__m128i *)dst, sum0); + _mm_storel_epi64((__m128i *)(dst + 8), sum1); + + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 8, pixels); + + __m128i wh[2], ww[4]; + load_weight_w16(sm_weight_arrays, 8, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 16, pixels); + + __m128i wh[4], ww[4]; + load_weight_w16(sm_weight_arrays, 16, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 32, pixels); + + __m128i wh[8], ww[4]; + load_weight_w16(sm_weight_arrays, 32, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3); +} + +static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab0 = _mm_load_si128((const 
__m128i *)above); + __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16)); + + pixels[10] = _mm_set1_epi16((uint16_t)above[31]); + pixels[8] = _mm_load_si128((const __m128i *)left); + pixels[9] = _mm_load_si128((const __m128i *)(left + 16)); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab0, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab0, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpacklo_epi8(ab1, zero); + pixels[4] = _mm_unpacklo_epi16(x, bp); + pixels[5] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab1, zero); + pixels[6] = _mm_unpacklo_epi16(x, bp); + pixels[7] = _mm_unpackhi_epi16(x, bp); +} + +static INLINE void load_weight_w32(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w32_0, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_0, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpacklo_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[4] = _mm_unpacklo_epi16(x, y); + weight_w[5] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[6] = _mm_unpacklo_epi16(x, y); + weight_w[7] = _mm_unpackhi_epi16(x, y); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + + weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]); + weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]); + weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]); + weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]); + } +} + +static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? 
_mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[8] : pixels[9]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + + int j; + __m128i s[8]; + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[10]); + + for (j = 0; j < 8; ++j) { + s[j] = _mm_madd_epi16(pixels[j], wh_sc); + s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j])); + s[j] = _mm_add_epi32(s[j], round); + s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale); + } + + for (j = 0; j < 8; j += 2) { + __m128i sum = _mm_packus_epi16(s[j], s[j + 1]); + sum = _mm_shuffle_epi8(sum, gat); + _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum); + } + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 16, pixels); + + __m128i wh[4], ww[8]; + load_weight_w32(sm_weight_arrays, 16, wh, ww); + + smooth_pred_32x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 32, pixels); + + __m128i wh[8], ww[8]; + load_weight_w32(sm_weight_arrays, 32, wh, ww); + + smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3); +} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h index 4238e651b..26c5cfe59 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h @@ -18,17 +18,17 @@ #include "aom_dsp/x86/txfm_common_avx2.h" static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif + if (sizeof(tran_low_t) == 4) { + *in = _mm256_setr_epi16( + (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], + (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], + (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], + (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], + (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], + (int16_t)coeff[15]); + } else { + *in = _mm256_loadu_si256((const __m256i *)coeff); + } } static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h index 95d246c3c..342816977 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h @@ -133,12 +133,12 @@ static INLINE void array_transpose_16x16(__m128i *res0, 
__m128i *res1) { // Function to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif + if (sizeof(tran_low_t) == 4) { + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); + } else { + return _mm_load_si128((const __m128i *)data); + } } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index 7e134dc63..8343dbbed 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -178,10 +178,20 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, #endif // !CONFIG_PARALLEL_DEBLOCKING FILTER4; +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 8); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0); + + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 8); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0); +#else _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +#endif } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -267,8 +277,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); +#if !CONFIG_PARALLEL_DEBLOCKING // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); +#endif // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); @@ -279,7 +291,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); ps1ps0 = _mm_srli_si128(ps1ps0, 4); *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - +#if !CONFIG_PARALLEL_DEBLOCKING *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); @@ -287,6 +299,19 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +#endif +} + +static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num, + uint8_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x); + const __m128i hi = _mm_srli_si128(*x, 8); + *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi); +#else + _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x); + _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x)); +#endif } void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, @@ -580,44 +605,37 @@ void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); 
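The q{N}p{N} registers in this filter pack the p-side row of eight pixels in their low 64 bits and the matching q-side row in their high 64 bits, which is why each removed store pair here collapses into a single call to the new store_buffer_horz_8() helper. A minimal standalone sketch of that store pattern, assuming the same packing (the function name below is illustrative, not part of the patch):

#include <emmintrin.h>
#include <stdint.h>

// Write one packed q/p register to the two mirrored rows around the filtered edge:
// the low 8 bytes go to row (num + 1) above the edge, the high 8 bytes to row num below it.
static void store_qp_rows_sketch(__m128i qp, uint8_t *s, int pitch, int num) {
  _mm_storel_epi64((__m128i *)(s - (num + 1) * pitch), qp);
  _mm_storeh_pi((__m64 *)(s + num * pitch), _mm_castsi128_ps(qp));
}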
+ store_buffer_horz_8(&q6p6, p, 6, s); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + store_buffer_horz_8(&q5p5, p, 5, s); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + store_buffer_horz_8(&q4p4, p, 4, s); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + store_buffer_horz_8(&q3p3, p, 3, s); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + store_buffer_horz_8(&q2p2, p, 2, s); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + store_buffer_horz_8(&q1p1, p, 1, s); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + store_buffer_horz_8(&q0p0, p, 0, s); } } @@ -651,10 +669,33 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput; + +static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x, + int p, int offset, uint8_t *s) { + int i; + if (pixel_num == FOUR_PIXELS) { + for (i = 13; i >= 0; i--) { + *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]); + } + } + if (pixel_num == EIGHT_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]); + } + } + if (pixel_num == SIXTEEN_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]); + } + } +} + +static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num, + unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -910,73 +951,62 @@ void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + __m128i x[14]; + x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + 
x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + + store_buffer_horz_16(pixel_num, x, p, 6, s); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1186,15 +1216,35 @@ void 
aom_lpf_horizontal_8_sse2(unsigned char *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1); + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0); + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1); + *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2); +#else _mm_storel_epi64((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s - 2 * p), p1); _mm_storel_epi64((__m128i *)(s - 1 * p), p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q0); _mm_storel_epi64((__m128i *)(s + 1 * p), q1); _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +#endif } } +void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { +#if CONFIG_PARALLEL_DEBLOCKING + lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); +#else + lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); +#endif +} + void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..027c890dc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H +#define _AOM_DSP_X86_LPF_COMMON_X86_H + +#include <emmintrin.h> // SSE2 + +#include "./aom_config.h" + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = 
in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} +#endif // _AOM_DSP_X86_LPF_COMMON_X86_H diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 6a73ac460..2536f91d2 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -98,7 +98,13 @@ MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) -#endif +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(32, 128) +MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -294,7 +300,13 @@ HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) -#endif +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(32, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index 24e7ed1c6..3ffe132be 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -131,7 +131,13 @@ MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) -#endif +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR_SSSE3(128, 32) +MASK_SUBPIX_VAR_SSSE3(32, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -712,6 +718,12 @@ HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32) +#endif #endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 3fd6f71e5..52dd508ec 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -142,6 +142,8 @@ OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) #endif //////////////////////////////////////////////////////////////////////////////// @@ -271,5 +273,7 @@ HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) #endif #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 44cfa8e28..392616af3 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -151,7 +151,13 @@ OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) 
OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) -#endif +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +OBMCVARWXH(32, 128) +OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES //////////////////////////////////////////////////////////////////////////////// // High bit-depth @@ -364,5 +370,11 @@ HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) -#endif +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +HBD_OBMCVARWXH(32, 128) +HBD_OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 890c1f01e..0e7f679d0 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -16,29 +16,29 @@ #include "aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif + if (sizeof(tran_low_t) == 4) { + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + } else { + return _mm_load_si128((const __m128i *)coeff_ptr); + } } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif + if (sizeof(tran_low_t) == 4) { + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); + } else { + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); + } } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 4570e2ce6..2c67f450f 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -256,4 +256,6 @@ SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index 88d427077..b4cc6abf1 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -163,6 +163,10 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; 
sad64x32_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 +%endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -261,6 +265,8 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 %endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index 4f7a60c22..1a8fed710 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -15,6 +15,7 @@ #include <immintrin.h> #include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/common_avx2.h" #define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ @@ -34,135 +35,6 @@ static INLINE void mm256_reverse_epi16(__m256i *u) { *u = _mm256_permute2x128_si256(v, v, 1); } -// Note: in and out could have the same value -static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, 
tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - out[14] = 
_mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} - static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, const __m256i *cospi) { const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h index e4ac56339..4e6eecd32 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h @@ -16,16 +16,16 @@ // This header file should be put below any x86 intrinsics head file static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); + } } #endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 918844185..211fad3f8 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -382,6 +382,28 @@ unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, assert(sum >= -255 * 32 * 8); return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); } + +unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 16 * 64); + assert(sum >= -255 * 16 * 64); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 16); + assert(sum >= -255 * 64 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} #endif // The 2 unused parameters are place holders for PIC enabled build. 
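The new 16x64 and 64x16 kernels above follow the same pattern as the other block sizes: accumulate the sum of squared differences and the signed sum over the block, then return sse - sum^2 / N with N = width * height folded into a right shift (16 * 64 = 1024, hence the >> 10). A scalar reference of that formula, offered only as a sketch (not part of the patch):

#include <stdint.h>

// Reference block "variance" as used here: sse minus (sum * sum) / (w * h).
// In the SIMD versions w * h is a power of two, so the division becomes a shift
// (e.g. 16 * 64 -> >> 10, 32 * 8 -> >> 8).
static uint32_t block_variance_ref(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h) {
  int64_t sse = 0, sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;
      sse += (int64_t)d * d;
    }
  }
  return (uint32_t)(sse - (sum * sum) / (w * h));
}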
@@ -451,7 +473,9 @@ DECLS(ssse3);
   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
   FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t))
+  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
+  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
+  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
 #else
 #define FNS(opt) \
   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
@@ -543,7 +567,9 @@ DECLS(ssse3);
   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
   FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t))
+  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
+  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
+  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
 #else
 #define FNS(opt) \
   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
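A recurring change throughout this patch (load_coeff, load_input_data, load_coefficients, store_coefficients, storeu_output) replaces #if CONFIG_HIGHBITDEPTH blocks with a plain if (sizeof(tran_low_t) == 4) test: the condition is a compile-time constant, so the compiler drops the dead branch, while both code paths now stay visible to the compiler in every build configuration. A minimal sketch of the same idiom, mirroring the patched load_input_data (the typedef and function name here are illustrative, not from the patch):

#include <emmintrin.h>
#include <stdint.h>

typedef int32_t tran_low_t;  // 32-bit when high bit depth is enabled, 16-bit otherwise

// Load 8 coefficients as packed 16-bit lanes regardless of how tran_low_t is defined.
// sizeof(tran_low_t) is a constant, so the unused branch is eliminated at compile time,
// but (unlike an #if) both branches are always type-checked.
static __m128i load_8_coeffs_sketch(const tran_low_t *c) {
  if (sizeof(tran_low_t) == 4) {
    return _mm_setr_epi16((int16_t)c[0], (int16_t)c[1], (int16_t)c[2],
                          (int16_t)c[3], (int16_t)c[4], (int16_t)c[5],
                          (int16_t)c[6], (int16_t)c[7]);
  } else {
    return _mm_load_si128((const __m128i *)c);
  }
}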