summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
diff options
context:
space:
mode:
authortrav90 <travawine@palemoon.org>2018-10-19 23:05:00 -0500
committertrav90 <travawine@palemoon.org>2018-10-19 23:05:03 -0500
commitd2499ead93dc4298c0882fe98902acb1b5209f99 (patch)
treecb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
parent41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff)
downloadUXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar
UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.gz
UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.lz
UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.xz
UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.zip
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c')
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c707
1 files changed, 392 insertions, 315 deletions
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 83e0098ba..097e0778f 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
const unsigned char *lt, const unsigned char *thr, int bd) {
int i;
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit, limit, thresh;
__m128i t80;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
@@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat2 = _mm_unpacklo_epi64(flat2, flat2);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3], flat_pq[3];
- __m128i flat2_p[6], flat2_q[6];
- __m128i flat2_pq[6];
- {
- __m128i work0;
+
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
__m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
__m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
sum_p = _mm_add_epi16(sum_p, sum_lp);
@@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i sum_lq = _mm_srli_si128(sum_lp, 8);
__m128i sum_q = _mm_srli_si128(sum_p, 8);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
- flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
- flat2_q[0] =
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
-
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
- __m128i sum_p6, sum_p3;
sum_p6 = _mm_add_epi16(pq[6], pq[6]);
sum_p3 = _mm_add_epi16(pq[3], pq[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- flat2_p[1] = _mm_add_epi16(sum_p, work0);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
work0 = _mm_add_epi16(sum_p3, pq[1]);
@@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
- flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
work0 = _mm_add_epi16(sum_p3, pq[2]);
@@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- }
-
- // highbd_filter8
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
- for (i = 0; i < 3; i++) {
- pq[i] = _mm_andnot_si128(flat, pq[i]);
- flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
- }
-
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- pq[i] = _mm_andnot_si128(flat2, pq[i]);
- flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
- // get values for when (flat2 && flat && mask)
- pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
}
}
@@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
const uint8_t *thr1, int bd) {
__m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
+
get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
&t80);
__m128i mask;
@@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i ps[2], qs[2];
highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
- {
+
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
__m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
__m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
__m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
sum_q = _mm_add_epi16(sum_q, sum_lq);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat2_p[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0]))),
- 4);
- flat2_q[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1]))),
- 4);
flat_p[0] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
flat_q[0] =
@@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
__m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
__m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
- flat2_p[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
- 4);
- flat2_q[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
- 4);
+
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
sum_lq = _mm_sub_epi16(sum_lp, p[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
flat_p[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
flat_q[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
- 4);
- flat2_q[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
- 4);
+
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
flat_p[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
flat_q[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
- 4);
- flat2_q[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
- 4);
- flat2_q[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
- 4);
- flat2_q[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
- 4);
- }
- // highbd_filter8
- p[2] = _mm_andnot_si128(flat, p[2]);
- // p2 remains unchanged if !(flat && mask)
- flat_p[2] = _mm_and_si128(flat, flat_p[2]);
- // when (flat && mask)
- p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
- q[2] = _mm_andnot_si128(flat, q[2]);
- flat_q[2] = _mm_and_si128(flat, flat_q[2]);
- q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
- int i;
- for (i = 1; i >= 0; i--) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- p[i] = _mm_andnot_si128(flat2, p[i]);
- flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
- // get values for when (flat2 && flat && mask)
- p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
- q[i] = _mm_andnot_si128(flat2, q[i]);
- flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
- q[i] = _mm_or_si128(q[i], flat2_q[i]);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
}
}
@@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
// flat_mask
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
@@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
// op1
- workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
- _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
- workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
// op0
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
- workp_a =
- _mm_add_epi16(workp_a,
- workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-
- flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
// oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
- *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
- workp_b = _mm_add_epi16(*q1, *q2);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
// oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
- *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
workp_b = _mm_add_epi16(*q2, *q2);
- workp_shft1 = _mm_add_epi16(
- workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
- flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
- }
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
}
static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
@@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
// flat_mask
flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
flat = _mm_max_epi16(flat, work);
@@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
// op1
@@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
workp_shft1 = _mm_add_epi16(
workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
oq1 = _mm_srli_epi16(workp_shft1, 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
}
void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
@@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
__m128i mask, hev, flat;
__m128i pq[4];
__m128i p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
pq[0] = _mm_unpacklo_epi64(*p0, *q0);
pq[1] = _mm_unpacklo_epi64(*p1, *q1);
@@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
// flat_mask4
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
flat = _mm_max_epi16(abs_p1p0, flat);
@@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
// o*p2
workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
// o*p1
workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
@@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// oq2
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
-
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
}
static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
@@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
flat = _mm_max_epi16(work1, flat);
work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
flat = _mm_max_epi16(work0, flat);
flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
@@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
}
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
-
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
-
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
}
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,