diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:00 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:03 -0500 |
commit | d2499ead93dc4298c0882fe98902acb1b5209f99 (patch) | |
tree | cb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c | |
parent | 41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff) | |
download | UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.gz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.lz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.xz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.zip |
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 707 |
1 files changed, 392 insertions, 315 deletions
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 83e0098ba..097e0778f 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, const unsigned char *lt, const unsigned char *thr, int bd) { int i; + const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i t80; get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); @@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3], flat_pq[3]; - __m128i flat2_p[6], flat2_q[6]; - __m128i flat2_pq[6]; - { - __m128i work0; + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); @@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); - flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); - flat2_q[0] = - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); - __m128i sum_p6, sum_p3; sum_p6 = _mm_add_epi16(pq[6], pq[6]); sum_p3 = _mm_add_epi16(pq[3], pq[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - flat2_p[1] = _mm_add_epi16(sum_p, work0); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); work0 = _mm_add_epi16(sum_p3, pq[1]); @@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); sum_p3 = _mm_add_epi16(sum_p3, pq[3]); work0 = _mm_add_epi16(sum_p3, pq[2]); @@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - } - - // highbd_filter8 - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - - for (i = 0; i < 3; i++) { - pq[i] = _mm_andnot_si128(flat, pq[i]); - flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat_pq[i]); - } - - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - pq[i] = _mm_andnot_si128(flat2, pq[i]); - flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); - // get values for when (flat2 && flat && mask) - pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); } } @@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, const uint8_t *thr1, int bd) { __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, &t80); __m128i mask; @@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i ps[2], qs[2]; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - { + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - flat2_p[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), - _mm_add_epi16(p[1], q[0]))), - 4); - flat2_q[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), - _mm_add_epi16(p[0], q[1]))), - 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = @@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); - flat2_p[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), - 4); - flat2_q[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), - 4); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p3 = _mm_add_epi16(sum_p3, p[3]); - sum_q3 = _mm_add_epi16(sum_q3, q[3]); - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - flat2_p[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), - 4); - flat2_q[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), - 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - flat2_p[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), - 4); - flat2_q[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - flat2_p[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), - 4); - flat2_q[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - flat2_p[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), - 4); - flat2_q[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), - 4); - } - // highbd_filter8 - p[2] = _mm_andnot_si128(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm_and_si128(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm_andnot_si128(flat, q[2]); - flat_q[2] = _mm_and_si128(flat, flat_q[2]); - q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; } } @@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); @@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), - _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 - workp_a = - _mm_add_epi16(workp_a, - workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 - - flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3); + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), - *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 - workp_b = _mm_add_epi16(*q1, *q2); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), - *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); - workp_shft1 = _mm_add_epi16( - workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); - } - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( @@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + // flat_mask flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); flat = _mm_max_epi16(flat, work); @@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_shft0, workp_shft1; // op1 @@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( workp_shft1 = _mm_add_epi16( workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 oq1 = _mm_srli_epi16(workp_shft1, 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } } void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, @@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( __m128i mask, hev, flat; __m128i pq[4]; __m128i p1p0, q1q0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); @@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask4 flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); @@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); @@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( @@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); flat = _mm_max_epi16(work1, flat); work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); flat = _mm_max_epi16(work0, flat); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b; // Added before shift for rounding part of ROUND_POWER_OF_TWO @@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); - - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, |