diff options
Diffstat (limited to 'third_party/aom/av1/common/x86/filterintra_sse4.c')
-rw-r--r-- | third_party/aom/av1/common/x86/filterintra_sse4.c | 931 |
1 files changed, 54 insertions, 877 deletions
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c index 4f77da446..c11edc1d4 100644 --- a/third_party/aom/av1/common/x86/filterintra_sse4.c +++ b/third_party/aom/av1/common/x86/filterintra_sse4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -11,888 +11,65 @@ #include <smmintrin.h> -#include "./av1_rtcd.h" -#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" -#if USE_3TAP_INTRA_FILTER -void filterintra_sse4_3tap_dummy_func(void); -void filterintra_sse4_3tap_dummy_func(void) {} -#else - -static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left, - __m128i *sum) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - - __m128i u0 = _mm_unpacklo_epi8(a, zero); - __m128i u1 = _mm_unpacklo_epi8(l, zero); - - sum[0] = _mm_add_epi16(u0, u1); -} - -static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsSmall(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 4; - sum_value >>= 3; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsSmall(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 8; - sum_value >>= 4; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left, - __m128i *sum) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - - __m128i u0 = _mm_unpacklo_epi8(a, zero); - __m128i u1 = _mm_unpacklo_epi8(l, zero); - - sum[0] = _mm_add_epi16(u0, u1); - - u0 = _mm_unpackhi_epi8(a, zero); - u1 = _mm_unpackhi_epi8(l, zero); - - sum[0] = _mm_add_epi16(sum[0], u0); - sum[0] = _mm_add_epi16(sum[0], u1); -} - -static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - - AddPixelsLarge(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 16; - sum_value >>= 5; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector[2], u; - uint16_t sum_value; - - AddPixelsLarge(above, left, &sum_vector[0]); - AddPixelsLarge(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values - - u = _mm_srli_si128(sum_vector[0], 2); - sum_vector[0] = _mm_add_epi16(sum_vector[0], u); - - sum_value = _mm_extract_epi16(sum_vector[0], 0); - sum_value += 32; - sum_value >>= 6; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// params[4] : mean value, 4 int32_t repetition -// -static INLINE int CalcRefPixelsMeanValue(const uint8_t *above, - const uint8_t *left, int bs, - __m128i *params) { - int meanValue = 0; - switch (bs) { - case 4: meanValue = GetMeanValue4x4(above, left, params); break; - case 8: meanValue = GetMeanValue8x8(above, left, params); break; - case 16: meanValue = GetMeanValue16x16(above, left, params); break; - case 32: meanValue = GetMeanValue32x32(above, left, params); break; - default: assert(0); - } - return meanValue; -} - -// Note: -// params[0-3] : 4-tap filter coefficients (int32_t per coefficient) -// -static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) { - const TX_SIZE tx_size = - (bs == 32) ? TX_32X32 - : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4))); - // c0 - params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0], - av1_filter_intra_taps_4[tx_size][mode][0]); - // c1 - params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1], - av1_filter_intra_taps_4[tx_size][mode][1]); - // c2 - params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2], - av1_filter_intra_taps_4[tx_size][mode][2]); - // c3 - params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3], - av1_filter_intra_taps_4[tx_size][mode][3]); -} - -static const int maxBlkSize = 32; - -static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride)); - __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride)); - __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - *((int *)dst) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0); - p0 = _mm_srli_si128(p0, 4); - *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0); -} - -static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3; - int r = 0; - - while (r < 8) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - r += 1; - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storel_epi64((__m128i *)dst, p0); - dst += stride; - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)dst, p0); - dst += stride; - r += 1; - } -} - -static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3; - int r = 0; - - while (r < 16) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storel_epi64((__m128i *)dst, p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dst + 8), p0); - dst += stride; - r += 1; - } -} - -static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst, - ptrdiff_t stride) { - const int predStride = (maxBlkSize << 1) + 1; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - int r = 0; - - while (r < 32) { - p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); - p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); - p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); - p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); - - p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16)); - p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20)); - p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24)); - p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28)); - - p0 = _mm_add_epi32(p0, mean[0]); - p1 = _mm_add_epi32(p1, mean[0]); - p2 = _mm_add_epi32(p2, mean[0]); - p3 = _mm_add_epi32(p3, mean[0]); - - p4 = _mm_add_epi32(p4, mean[0]); - p5 = _mm_add_epi32(p5, mean[0]); - p6 = _mm_add_epi32(p6, mean[0]); - p7 = _mm_add_epi32(p7, mean[0]); - - p0 = _mm_packus_epi32(p0, p1); - p1 = _mm_packus_epi32(p2, p3); - p0 = _mm_packus_epi16(p0, p1); - - p4 = _mm_packus_epi32(p4, p5); - p5 = _mm_packus_epi32(p6, p7); - p4 = _mm_packus_epi16(p4, p5); - - _mm_storel_epi64((__m128i *)dst, p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dst + 8), p0); - - _mm_storel_epi64((__m128i *)(dst + 16), p4); - p4 = _mm_srli_si128(p4, 8); - _mm_storel_epi64((__m128i *)(dst + 24), p4); - - dst += stride; - r += 1; - } -} - -static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst, - ptrdiff_t stride) { - switch (bs) { - case 4: SavePred4x4(pred, mean, dst, stride); break; - case 8: SavePred8x8(pred, mean, dst, stride); break; - case 16: SavePred16x16(pred, mean, dst, stride); break; - case 32: SavePred32x32(pred, mean, dst, stride); break; - default: assert(0); - } -} - -typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred, - const int predStride); - -static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 2) = x; - - sum = _mm_extract_epi32(u0, 2); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 3) = x; - - sum = _mm_extract_epi32(u0, 3); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 4) = x; -} - -static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 2) = x; - - sum = _mm_extract_epi32(u0, 2); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 3) = x; -} - -static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; - - sum = _mm_extract_epi32(u0, 1); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 2) = x; -} - -static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred, - const int predStride) { - __m128i u0, u1, u2; - int c0 = _mm_extract_epi32(prm[1], 0); - int x = *(pred + predStride); - int sum; - - u0 = _mm_mullo_epi32(p[0], prm[2]); - u1 = _mm_mullo_epi32(p[1], prm[0]); - u2 = _mm_mullo_epi32(p[2], prm[3]); - - u0 = _mm_add_epi32(u0, u1); - u0 = _mm_add_epi32(u0, u2); - - sum = _mm_extract_epi32(u0, 0); - sum += c0 * x; - x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); - *(pred + predStride + 1) = x; -} - -static ProducePixelsFunc prodPixelsFuncTab[4] = { - ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels -}; - -static void ProducePixels(int *pred, const __m128i *prm, int remain) { - __m128i p[3]; - const int predStride = (maxBlkSize << 1) + 1; - int index; - - p[0] = _mm_loadu_si128((const __m128i *)pred); - p[1] = _mm_loadu_si128((const __m128i *)(pred + 1)); - p[2] = _mm_loadu_si128((const __m128i *)(pred + 2)); - - if (remain <= 2) { - return; - } - if (remain > 5) { - index = 3; - } else { - index = remain - 3; - } - prodPixelsFuncTab[index](p, prm, pred, predStride); -} - -// Note: -// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c -// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 -static void GeneratePrediction(const uint8_t *above, const uint8_t *left, - const int bs, const __m128i *prm, int meanValue, - uint8_t *dst, ptrdiff_t stride) { - int pred[33][65]; - int r, c, colBound; - int remainings; - - for (r = 0; r < bs; ++r) { - pred[r + 1][0] = (int)left[r] - meanValue; - } - - above -= 1; - for (c = 0; c < 2 * bs + 1; ++c) { - pred[0][c] = (int)above[c] - meanValue; - } - - r = 0; - c = 0; - while (r < bs) { - colBound = (bs << 1) - r; - for (c = 0; c < colBound; c += 4) { - remainings = colBound - c + 1; - ProducePixels(&pred[r][c], prm, remainings); - } - r += 1; - } - - SavePrediction(&pred[1][1], &prm[4], bs, dst, stride); -} - -static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs, - __m128i *prm, uint8_t *dst, ptrdiff_t stride) { - int meanValue = 0; - meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]); - GeneratePrediction(above, left, bs, prm, meanValue, dst, stride); -} - -void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, DC_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, V_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, H_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D45_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D135_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D117_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D153_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D207_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, D63_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - __m128i prm[5]; - GetIntraFilterParams(bs, TM_PRED, &prm[0]); - FilterPrediction(above, left, bs, prm, dst, stride); -} - -// ============== High Bit Depth ============== -#if CONFIG_HIGHBITDEPTH -static INLINE int HighbdGetMeanValue4x4(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - (void)bd; - - sum_vector = _mm_add_epi16(a, l); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 4; - sum_value >>= 3; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int HighbdGetMeanValue8x8(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i a = _mm_loadu_si128((const __m128i *)above); - const __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint16_t sum_value; - (void)bd; - - sum_vector = _mm_add_epi16(a, l); - - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - - sum_value = _mm_extract_epi16(sum_vector, 0); - sum_value += 8; - sum_value >>= 4; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// Process 16 pixels above and left, 10-bit depth -// Add to the last 8 pixels sum -static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left, - __m128i *sum) { - __m128i a = _mm_loadu_si128((const __m128i *)above); - __m128i l = _mm_loadu_si128((const __m128i *)left); - sum[0] = _mm_add_epi16(a, l); - a = _mm_loadu_si128((const __m128i *)(above + 8)); - l = _mm_loadu_si128((const __m128i *)(left + 8)); - sum[0] = _mm_add_epi16(sum[0], a); - sum[0] = _mm_add_epi16(sum[0], l); -} - -// Note: -// Process 16 pixels above and left, 12-bit depth -// Add to the last 8 pixels sum -static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left, - __m128i *sum) { - __m128i a = _mm_loadu_si128((const __m128i *)above); - __m128i l = _mm_loadu_si128((const __m128i *)left); - const __m128i zero = _mm_setzero_si128(); - __m128i v0, v1; - - v0 = _mm_unpacklo_epi16(a, zero); - v1 = _mm_unpacklo_epi16(l, zero); - sum[0] = _mm_add_epi32(v0, v1); - - v0 = _mm_unpackhi_epi16(a, zero); - v1 = _mm_unpackhi_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); - - a = _mm_loadu_si128((const __m128i *)(above + 8)); - l = _mm_loadu_si128((const __m128i *)(left + 8)); - - v0 = _mm_unpacklo_epi16(a, zero); - v1 = _mm_unpacklo_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); - - v0 = _mm_unpackhi_epi16(a, zero); - v1 = _mm_unpackhi_epi16(l, zero); - sum[0] = _mm_add_epi32(sum[0], v0); - sum[0] = _mm_add_epi32(sum[0], v1); -} - -static INLINE int HighbdGetMeanValue16x16(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector, u; - uint32_t sum_value = 0; - - if (10 == bd) { - AddPixels10bit(above, left, &sum_vector); - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values - sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values - - u = _mm_srli_si128(sum_vector, 2); - sum_vector = _mm_add_epi16(sum_vector, u); - sum_value = _mm_extract_epi16(sum_vector, 0); - } else if (12 == bd) { - AddPixels12bit(above, left, &sum_vector); - - sum_vector = _mm_hadd_epi32(sum_vector, zero); - u = _mm_srli_si128(sum_vector, 4); - sum_vector = _mm_add_epi32(u, sum_vector); - sum_value = _mm_extract_epi32(sum_vector, 0); - } - - sum_value += 16; - sum_value >>= 5; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -static INLINE int HighbdGetMeanValue32x32(const uint16_t *above, - const uint16_t *left, const int bd, - __m128i *params) { - const __m128i zero = _mm_setzero_si128(); - __m128i sum_vector[2], u; - uint32_t sum_value = 0; - - if (10 == bd) { - AddPixels10bit(above, left, &sum_vector[0]); - AddPixels10bit(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values - sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values - - u = _mm_srli_si128(sum_vector[0], 2); - sum_vector[0] = _mm_add_epi16(sum_vector[0], u); - sum_value = _mm_extract_epi16(sum_vector[0], 0); - } else if (12 == bd) { - AddPixels12bit(above, left, &sum_vector[0]); - AddPixels12bit(above + 16, left + 16, &sum_vector[1]); - - sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]); - sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero); - u = _mm_srli_si128(sum_vector[0], 4); - sum_vector[0] = _mm_add_epi32(u, sum_vector[0]); - sum_value = _mm_extract_epi32(sum_vector[0], 0); - } - - sum_value += 32; - sum_value >>= 6; - *params = _mm_set1_epi32(sum_value); - return sum_value; -} - -// Note: -// params[4] : mean value, 4 int32_t repetition -// -static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above, - const uint16_t *left, int bs, - const int bd, __m128i *params) { - int meanValue = 0; - switch (bs) { - case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break; - case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break; - case 16: - meanValue = HighbdGetMeanValue16x16(above, left, bd, params); - break; - case 32: - meanValue = HighbdGetMeanValue32x32(above, left, bd, params); - break; - default: assert(0); - } - return meanValue; -} - -// Note: -// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c -// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 -static void HighbdGeneratePrediction(const uint16_t *above, - const uint16_t *left, const int bs, - const int bd, const __m128i *prm, - int meanValue, uint16_t *dst, - ptrdiff_t stride) { - int pred[33][65]; - int r, c, colBound; - int remainings; - int ipred; - - for (r = 0; r < bs; ++r) { - pred[r + 1][0] = (int)left[r] - meanValue; - } - - above -= 1; - for (c = 0; c < 2 * bs + 1; ++c) { - pred[0][c] = (int)above[c] - meanValue; - } - - r = 0; - c = 0; - while (r < bs) { - colBound = (bs << 1) - r; - for (c = 0; c < colBound; c += 4) { - remainings = colBound - c + 1; - ProducePixels(&pred[r][c], prm, remainings); +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + // The initialization is just for silencing Jenkins static analysis warnings + for (r = 0; r < bh + 1; ++r) + memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]); + const __m128i filter_intra_scale_bits = + _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS)); + + for (r = 1; r < bh + 1; r += 2) { + for (c = 1; c < bw + 1; c += 4) { + DECLARE_ALIGNED(16, uint8_t, p[8]); + memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); + p[5] = buffer[r][c - 1]; + p[6] = buffer[r + 1][c - 1]; + p[7] = 0; + const __m128i p_b = xx_loadl_64(p); + const __m128i in = _mm_unpacklo_epi64(p_b, p_b); + const __m128i out_01 = _mm_maddubs_epi16(in, f1f0); + const __m128i out_23 = _mm_maddubs_epi16(in, f3f2); + const __m128i out_45 = _mm_maddubs_epi16(in, f5f4); + const __m128i out_67 = _mm_maddubs_epi16(in, f7f6); + const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23); + const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67); + const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); + // Rounding + const __m128i round_w = + _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits); + const __m128i out_r = _mm_packus_epi16(round_w, round_w); + const __m128i out_r1 = _mm_srli_si128(out_r, 4); + // Storing + xx_storel_32(&buffer[r][c], out_r); + xx_storel_32(&buffer[r + 1][c], out_r1); } - r += 1; } - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - ipred = pred[r + 1][c + 1] + meanValue; - dst[c] = clip_pixel_highbd(ipred, bd); - } + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); dst += stride; } } - -static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left, - int bs, const int bd, __m128i *prm, - uint16_t *dst, ptrdiff_t stride) { - int meanValue = 0; - meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]); - HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride); -} - -void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, DC_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, V_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, H_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D45_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D135_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D117_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D153_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D207_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, D63_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} - -void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - __m128i prm[5]; - GetIntraFilterParams(bs, TM_PRED, &prm[0]); - HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); -} -#endif // CONFIG_HIGHBITDEPTH - -#endif // USE_3TAP_INTRA_FILTER |