diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/intrapred_sse2.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/intrapred_sse2.c | 870 |
1 files changed, 808 insertions, 62 deletions
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c index 2a83b9001..5b2452c8e 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -11,11 +11,11 @@ #include <emmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" -static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { - int i; - for (i = 0; i < 4; ++i) { +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { *(uint32_t *)dst = dc; dst += stride; *(uint32_t *)dst = dc; @@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, } } +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + static INLINE __m128i dc_sum_4(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); @@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) { return _mm_add_epi16(x0, high); } +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + // ----------------------------------------------------------------------------- // DC_PRED @@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); const uint32_t pred = _mm_cvtsi128_si32(row); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 4, dst, stride); @@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); @@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 8, dst, stride); } @@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32(above); @@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_TOP @@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_above = _mm_packus_epi16(sum_above, sum_above); const uint32_t pred = _mm_cvtsi128_si32(sum_above); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; @@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_LEFT @@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_left = _mm_packus_epi16(sum_left, sum_left); const uint32_t pred = _mm_cvtsi128_si32(sum_left); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -333,6 +683,62 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_128 @@ -341,7 +747,15 @@ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, (void)above; (void)left; const uint32_t pred = 0x80808080; - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // V_PRED @@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint32_t pred = *(uint32_t *)above; (void)left; - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); } void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); @@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } -void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { (void)left; - int i; - for (i = 0; i < 16; ++i) { + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, row0); _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); dst += stride; } } +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + // ----------------------------------------------------------------------------- // H_PRED @@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, *(uint32_t *)dst = _mm_cvtsi128_si32(row3); } -void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i left_col = _mm_loadl_epi64((__m128i const *)left); - left_col = _mm_unpacklo_epi8(left_col, left_col); - __m128i row0 = _mm_shufflelo_epi16(left_col, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_load_si128((__m128i const *)left); @@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); @@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_low, 0x55); row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); @@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); @@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, _mm_storel_epi64((__m128i *)dst, row3); } +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; @@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, h_pred_store_16xh(row, 4, dst, stride); } +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, h_prediction_16x8_2(&left_col_8p, dst, stride); } -void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i left_col, left_col_8p; - (void)above; +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { int i = 0; - do { - left_col = _mm_load_si128((const __m128i *)left); - left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); dst += stride << 2; - left_col_8p = _mm_unpackhi_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); dst += stride << 2; left += 16; i++; - } while (i < 2); + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); } static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, @@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, h_pred_store_32xh(row, 4, dst, stride); } +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i left_col, left_col_8p; @@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dst += stride << 2; h_prediction_32x8_2(&left_col_8p, dst, stride); } + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} |