summary refs log tree commit diff stats
path: root/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c')
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c 274
1 files changed, 1 insertions, 273 deletions
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
index 691e166cf..5a55736c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -11,7 +11,7 @@
#include <emmintrin.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
// -----------------------------------------------------------------------------
// H_PRED
@@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
dst += stride;
}
}
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-;
-; avg3_epu16: for each of the eight unsigned 16-bit lanes, returns
-; (x + 2*y + z + 2) >> 2, i.e. a 3-tap filter in which y is the centre
-; (double-weighted) tap and x, z are the outer taps.
-;
-; Why the trick works: _mm_avg_epu16(a, b) is (a + b + 1) >> 1, so
-; avg(x,z) is rounded up whenever x+z is odd. (x ^ z) & 1 is 1 exactly in
-; that case, so subtracting it yields floor((x+z)/2). Averaging that with
-; y then gives floor((x + z + 2y + 2) / 4), all within 16-bit lanes.
-;
-; x, y, z are passed by pointer and are only read.
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
- const __m128i *z) {
- const __m128i one = _mm_set1_epi16(1);
- const __m128i a = _mm_avg_epu16(*x, *z);  // (x + z + 1) >> 1, rounds up
- const __m128i b =
- _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));  // floor((x + z) / 2)
- return _mm_avg_epu16(b, *y);  // (b + y + 1) >> 1
-}
-/*
- * 4x4 D117 predictor on 16-bit pixels (SSE2).
- *
- * Lane names are listed lowest lane first: I, J, K = left[0..2],
- * X = above[-1], A..D = above[0..3]. With avg(a,b) = (a+b+1)>>1 and
- * f(a,b,c) = (a+2b+c+2)>>2, the block written to dst is:
- *   row0: avg(X,A)  avg(A,B)  avg(B,C)  avg(C,D)
- *   row1: f(I,X,A)  f(X,A,B)  f(A,B,C)  f(B,C,D)
- *   row2: f(J,I,X)  avg(X,A)  avg(A,B)  avg(B,C)
- *   row3: f(K,J,I)  f(I,X,A)  f(X,A,B)  f(A,B,C)
- *
- * dst:    output; four rows of four uint16_t, rows `stride` elements apart.
- * above:  row above the block; elements above[-4..3] are loaded.
- * left:   column left of the block; left[0..2] are read.
- * bd:     unused.
- */
-void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));  // lanes 0..7 = above[-4..3]
- const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
- const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
- const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);  // lane 3 still holds above[-1]
- const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);  // shifted down by one lane
- const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);  // shifted down by two lanes
- const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);  // lane i = avg(v[i], v[i+1])
- const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);  // lane i = f(v[i], v[i+1], v[i+2])
- const __m128i row0 = _mm_srli_si128(avg2, 6);  // avg2 lanes 3..6
- const __m128i row1 = _mm_srli_si128(avg3, 4);  // avg3 lanes 2..5
- const __m128i row2 = _mm_srli_si128(avg2, 4);  // avg2 lanes 2..5
- const __m128i row3 = _mm_srli_si128(avg3, 2);  // avg3 lanes 1..4
- (void)bd;
- _mm_storel_epi64((__m128i *)dst, row0);  // each store writes the low four pixels
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
-
- dst -= stride;  // back to row 2 to patch the first column of rows 2 and 3
- dst[0] = _mm_extract_epi16(avg3, 1);  // row 2, col 0 = f(J,I,X)
- dst[stride] = _mm_extract_epi16(avg3, 0);  // row 3, col 0 = f(K,J,I)
-}
-/*
- * 4x4 D135 predictor on 16-bit pixels (SSE2).
- *
- * Lane names are listed lowest lane first: I, J, K, L = left[0..3],
- * X = above[-1], A..D = above[0..3]. With f(a,b,c) = (a+2b+c+2)>>2
- * (symmetric in a and c), the block written to dst is:
- *   row0: f(I,X,A)  f(X,A,B)  f(A,B,C)  f(B,C,D)
- *   row1: f(J,I,X)  f(I,X,A)  f(X,A,B)  f(A,B,C)
- *   row2: f(K,J,I)  f(J,I,X)  f(I,X,A)  f(X,A,B)
- *   row3: f(L,K,J)  f(K,J,I)  f(J,I,X)  f(I,X,A)
- * i.e. every row is the row above shifted right by one pixel.
- *
- * dst:    output; four rows of four uint16_t, rows `stride` elements apart.
- * above:  row above the block; elements above[-4..3] are loaded.
- * left:   column left of the block; left[0..3] are read.
- * bd:     unused.
- */
-void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
- const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));  // lanes 0..7 = above[-4..3]
- const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
- const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
- const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);  // lane 3 still holds above[-1]
- const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);  // shifted down by one lane
- const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);  // shifted up by one lane, L fills lane 0
- const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);  // KJIXABCD is the centre tap
- const __m128i row0 = _mm_srli_si128(avg3, 6);  // avg3 lanes 3..6
- const __m128i row1 = _mm_srli_si128(avg3, 4);  // avg3 lanes 2..5
- const __m128i row2 = _mm_srli_si128(avg3, 2);  // avg3 lanes 1..4
- const __m128i row3 = avg3;  // avg3 lanes 0..3
- (void)bd;
- _mm_storel_epi64((__m128i *)dst, row0);  // each store writes the low four pixels
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
-}
-/*
- * 4x4 D153 predictor on 16-bit pixels (SSE2).
- *
- * Lane names are listed lowest lane first: I, J, K, L = left[0..3],
- * X = above[-1], A..C = above[0..2]. With avg(a,b) = (a+b+1)>>1 and
- * f(a,b,c) = (a+2b+c+2)>>2, the block written to dst is:
- *   row0: avg(I,X)  f(I,X,A)  f(X,A,B)  f(A,B,C)
- *   row1: avg(J,I)  f(J,I,X)  avg(I,X)  f(I,X,A)
- *   row2: avg(K,J)  f(K,J,I)  avg(J,I)  f(J,I,X)
- *   row3: avg(L,K)  f(L,K,J)  avg(K,J)  f(K,J,I)
- *
- * dst:    output; four rows of four uint16_t, rows `stride` elements apart.
- * above:  row above the block; elements above[-5..2] are loaded.
- * left:   column left of the block; left[0..3] are read.
- * bd:     unused.
- */
-void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
- const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));  // lanes 0..7 = above[-5..2]
- const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
- const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
- const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
- const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);  // lane 4 still holds above[-1]
- const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);  // shifted down by one lane
- const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);  // shifted down by two lanes
- const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);  // lane i = f(v[i], v[i+1], v[i+2])
- const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);  // lane i = avg(v[i], v[i+1])
- const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);  // interleave: avg2[0], avg3[0], avg2[1], avg3[1], ...
- const __m128i row2 = _mm_srli_si128(row3, 4);  // interleaved lanes 2..5
- const __m128i row1 = _mm_srli_si128(row3, 8);  // interleaved lanes 4..7
- const __m128i row0 = _mm_srli_si128(avg3, 4);  // avg3 lanes 2..5
- (void)bd;
- _mm_storel_epi64((__m128i *)dst, row0);
- dst[0] = _mm_extract_epi16(avg2, 3);  // row 0, col 0 is overwritten with avg(I,X)
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
-}
-/*
- * 4x4 D45E predictor on 16-bit pixels (SSE2).
- *
- * With a[i] = above[i] and f(p,q,r) = (p+2q+r+2)>>2, the pixel at row r,
- * column c is f(a[r+c], a[r+c+1], a[r+c+2]). The only position that would
- * need a[8] is (3,3); there a[7] is used instead, giving f(a[6],a[7],a[7]).
- *
- * dst:    output; four rows of four uint16_t, rows `stride` elements apart.
- * above:  row above the block; above[0..7] are read (unaligned load).
- * left:   unused.
- * bd:     unused.
- */
-void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);  // lanes 0..7 = above[0..7]
- const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);  // shifted down by one lane
- __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);  // shifted down by two lanes
- CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);  // repeat the last pixel where above[8] would go
- const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
- (void)left;
- (void)bd;
- _mm_storel_epi64((__m128i *)dst, avg3);  // row 0 = avg3 lanes 0..3
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // row 1 = lanes 1..4
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));  // row 2 = lanes 2..5
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));  // row 3 = lanes 3..6
-}
-/*
- * D45E predictor on 16-bit pixels, 4 wide by 8 tall (SSE2).
- *
- * With a[i] = above[i] and f(p,q,r) = (p+2q+r+2)>>2, the pixel at row r,
- * column c is f(a[r+c], a[r+c+1], a[r+c+2]). The only position that would
- * need a[12] is (7,3); there a[11] is used instead.
- *
- * Vector names in this function list lanes highest first (h76543210 has
- * above[0] in lane 0). Rows 0..4 come from one filtered vector built from
- * above[0..9]; rows 5..7 come from a second one built from above[5..11].
- *
- * dst:    output; eight rows of four uint16_t, rows `stride` elements apart.
- * above:  row above the block. The first load is _mm_load_si128, so this
- *         pointer must be 16-byte aligned. The load at above + 5 touches
- *         above[12] even though that lane is then overwritten.
- * left:   unused.
- * bd:     unused.
- */
-void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- __m128i h76543210 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
- __m128i hx7654321 = _mm_srli_si128(h76543210, 2);
- __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);  // above[1..8]
- __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
- __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);  // above[2..9]
- __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);  // lane i = f(a[i], a[i+1], a[i+2])
- _mm_storel_epi64((__m128i *)dst, avg3);  // row 0 = lanes 0..3
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // row 1 = lanes 1..4
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));  // row 2 = lanes 2..5
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));  // row 3 = lanes 3..6
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));  // row 4 = lanes 4..7
- dst += stride;
-
- // hcba98765
- h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));  // variable reused: now above[5..12]
- h76543210 = _mm_insert_epi16(h76543210, above[11], 7);  // lane 7 (above[12]) replaced by above[11]
- // hxcba9876
- hx7654321 = _mm_srli_si128(h76543210, 2);
- // hxxcba987
- hx8765432 = _mm_srli_si128(h76543210, 4);
- avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);  // lane i = f(a[5+i], a[6+i], a[7+i])
- _mm_storel_epi64((__m128i *)dst, avg3);  // row 5 = lanes 0..3
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));  // row 6 = lanes 1..4
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));  // row 7 = lanes 2..5
-}
-/*
- * D45E predictor on 16-bit pixels, 8 wide by 4 tall (SSE2).
- *
- * With a[i] = above[i] and f(p,q,r) = (p+2q+r+2)>>2, the pixel at row r,
- * column c is f(a[r+c], a[r+c+1], a[r+c+2]). The only position that would
- * need a[12] is (3,7); there a[11] is used instead.
- *
- * The three registers x0, x1, x2 rotate roles: each new row loads one
- * fresh vector and reuses the two loaded for the previous row.
- *
- * dst:    output; four rows of eight uint16_t, rows `stride` elements
- *         apart. Every row is written with _mm_store_si128, so each row
- *         address must be 16-byte aligned.
- * above:  row above the block. The first load is _mm_load_si128, so this
- *         pointer must be 16-byte aligned. The load at above + 5 touches
- *         above[12] even though that lane is then overwritten.
- * left:   unused.
- * bd:     unused.
- */
-void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- __m128i x0 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
- __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));  // above[1..8]
- __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));  // above[2..9]
- __m128i y = avg3_epu16(&x0, &x1, &x2);  // row 0
- _mm_store_si128((__m128i *)dst, y);
- dst += stride;
-
- x0 = _mm_loadu_si128((const __m128i *)(above + 3));  // above[3..10]
- y = avg3_epu16(&x1, &x2, &x0);  // row 1
- _mm_store_si128((__m128i *)dst, y);
- dst += stride;
-
- x1 = _mm_loadu_si128((const __m128i *)(above + 4));  // above[4..11]
- y = avg3_epu16(&x2, &x0, &x1);  // row 2
- _mm_store_si128((__m128i *)dst, y);
- dst += stride;
-
- x2 = _mm_loadu_si128((const __m128i *)(above + 5));  // above[5..12]
- x2 = _mm_insert_epi16(x2, above[11], 7);  // lane 7 (above[12]) replaced by above[11]
- y = avg3_epu16(&x0, &x1, &x2);  // row 3
- _mm_store_si128((__m128i *)dst, y);
-}
-/*
- * Emits one 8-pixel row: filters the three vectors with avg3_epu16 (a1 is
- * the centre tap), writes the result to *dst with an unaligned store, then
- * advances *dst by `stride` elements so the caller's pointer ends up on the
- * next row.
- */
-static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
- const __m128i *a2, uint16_t **dst,
- ptrdiff_t stride) {
- const __m128i y = avg3_epu16(a0, a1, a2);
- _mm_storeu_si128((__m128i *)*dst, y);
- *dst += stride;  // updates the caller's row pointer
-}
-/*
- * 8x8 D45E predictor on 16-bit pixels (SSE2).
- *
- * With a[i] = above[i] and f(p,q,r) = (p+2q+r+2)>>2, the pixel at row r,
- * column c is f(a[r+c], a[r+c+1], a[r+c+2]). The only position that would
- * need a[16] is (7,7); there a[15] is used instead.
- *
- * Row 0 is emitted before the loop, the loop emits three rows per pass
- * (two passes, rows 1..6) while rotating x0/x1/x2, and row 7 is handled
- * separately because of the last-pixel replacement.
- *
- * dst:    output; eight rows of eight uint16_t, rows `stride` elements
- *         apart. Rows 0..6 use unaligned stores (via d45e_w8); row 7 uses
- *         _mm_store_si128, so that row address must be 16-byte aligned.
- * above:  row above the block. The first load is _mm_load_si128, so this
- *         pointer must be 16-byte aligned. The load at above + 9 touches
- *         above[16] even though that lane is then overwritten.
- * left:   unused.
- * bd:     unused.
- */
-void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- __m128i x0 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
- __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
- __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
- d45e_w8(&x0, &x1, &x2, &dst, stride);  // row 0; dst now points at row 1
-
- int i = 3;  // offset into above of the next vector to load
- do {
- x0 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x1, &x2, &x0, &dst, stride);
-
- x1 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x2, &x0, &x1, &dst, stride);
-
- x2 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x0, &x1, &x2, &dst, stride);
- } while (i < 9);  // i goes 3 -> 6 -> 9: two passes, rows 1..6
-
- x0 = _mm_loadu_si128((const __m128i *)(above + 9));  // above[9..16]
- x0 = _mm_insert_epi16(x0, above[15], 7);  // lane 7 (above[16]) replaced by above[15]
- const __m128i y = avg3_epu16(&x1, &x2, &x0);  // row 7; x1, x2 hold above + 7, above + 8
- _mm_store_si128((__m128i *)dst, y);
-}
-/*
- * D45E predictor on 16-bit pixels, 8 wide by 16 tall (SSE2).
- *
- * With a[i] = above[i] and f(p,q,r) = (p+2q+r+2)>>2, the pixel at row r,
- * column c is f(a[r+c], a[r+c+1], a[r+c+2]). The only position that would
- * need a[24] is (15,7); there a[23] is used instead.
- *
- * Row 0 is emitted before the loop, the loop emits three rows per pass
- * (four passes, rows 1..12) while rotating x0/x1/x2, and rows 13..15 are
- * written out explicitly after the loop.
- *
- * dst:    output; sixteen rows of eight uint16_t, rows `stride` elements
- *         apart. Rows 0..12 use unaligned stores (via d45e_w8); rows
- *         13..15 use _mm_store_si128, so those row addresses must be
- *         16-byte aligned.
- * above:  row above the block. The first load is _mm_load_si128, so this
- *         pointer must be 16-byte aligned. The load at above + 17 touches
- *         above[24] even though that lane is then overwritten.
- * left:   unused.
- * bd:     unused.
- */
-void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- __m128i x0 = _mm_load_si128((const __m128i *)above);  // above[0..7], aligned load
- __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
- __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
- d45e_w8(&x0, &x1, &x2, &dst, stride);  // row 0; dst now points at row 1
-
- int i = 3;  // offset into above of the next vector to load
- do {
- x0 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x1, &x2, &x0, &dst, stride);
-
- x1 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x2, &x0, &x1, &dst, stride);
-
- x2 = _mm_loadu_si128((const __m128i *)(above + i++));
- d45e_w8(&x0, &x1, &x2, &dst, stride);
- } while (i < 15);  // i goes 3 -> 6 -> 9 -> 12 -> 15: four passes, rows 1..12
-
- x0 = _mm_loadu_si128((const __m128i *)(above + 15));  // above[15..22]
- __m128i y = avg3_epu16(&x1, &x2, &x0);  // row 13; x1, x2 hold above + 13, above + 14
- _mm_store_si128((__m128i *)dst, y);
- dst += stride;
-
- x1 = _mm_loadu_si128((const __m128i *)(above + 16));  // above[16..23]
- y = avg3_epu16(&x2, &x0, &x1);  // row 14
- _mm_store_si128((__m128i *)dst, y);
- dst += stride;
-
- x2 = _mm_loadu_si128((const __m128i *)(above + 17));  // above[17..24]
- x2 = _mm_insert_epi16(x2, above[23], 7);  // lane 7 (above[24]) replaced by above[23]
- y = avg3_epu16(&x0, &x1, &x2);  // row 15
- _mm_store_si128((__m128i *)dst, y);
-}