diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c | 274 |
1 files changed, 1 insertions, 273 deletions
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c index 691e166cf..5a55736c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -11,7 +11,7 @@ #include <emmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- // H_PRED @@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, dst += stride; } } - -// ----------------------------------------------------------------------------- -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, - const __m128i *z) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a = _mm_avg_epu16(*x, *z); - const __m128i b = - _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); - return _mm_avg_epu16(b, *y); -} - -void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); - const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); - const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); - const __m128i row0 = _mm_srli_si128(avg2, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg2, 4); - const __m128i row3 = _mm_srli_si128(avg3, 2); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - - dst -= stride; - dst[0] = _mm_extract_epi16(avg3, 1); - dst[stride] = _mm_extract_epi16(avg3, 0); -} - -void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); - const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); - const __m128i row0 = _mm_srli_si128(avg3, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg3, 2); - const __m128i row3 = avg3; - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); - const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); - const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); - const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); - const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); - const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); - const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); - const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); - const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); - const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); - const __m128i row2 = _mm_srli_si128(row3, 4); - const __m128i row1 = _mm_srli_si128(row3, 8); - const __m128i row0 = _mm_srli_si128(avg3, 4); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst[0] = _mm_extract_epi16(avg2, 3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); - const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); - __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); - CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); - const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); - (void)left; - (void)bd; - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); -} - -void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i h76543210 = _mm_load_si128((const __m128i *)above); - __m128i hx7654321 = _mm_srli_si128(h76543210, 2); - __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7); - __m128i hx8765432 = _mm_srli_si128(h87654321, 2); - __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); - __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); - dst += stride; - - // hcba98765 - h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); - h76543210 = _mm_insert_epi16(h76543210, above[11], 7); - // hxcba9876 - hx7654321 = _mm_srli_si128(h76543210, 2); - // hxxcba987 - hx8765432 = _mm_srli_si128(h76543210, 4); - avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); -} - -void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - __m128i y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x0 = _mm_loadu_si128((const __m128i *)(above + 3)); - y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 4)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 5)); - x2 = _mm_insert_epi16(x2, above[11], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} - -static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, - const __m128i *a2, uint16_t **dst, - ptrdiff_t stride) { - const __m128i y = avg3_epu16(a0, a1, a2); - _mm_storeu_si128((__m128i *)*dst, y); - *dst += stride; -} - -void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 9); - - x0 = _mm_loadu_si128((const __m128i *)(above + 9)); - x0 = _mm_insert_epi16(x0, above[15], 7); - const __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); -} - -void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 15); - - x0 = _mm_loadu_si128((const __m128i *)(above + 15)); - __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 16)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 17)); - x2 = _mm_insert_epi16(x2, above[23], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} |