diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:15 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:20 -0500 |
commit | bbcc64772580c8a979288791afa02d30bc476d2e (patch) | |
tree | 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c | |
parent | 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff) | |
download | UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip |
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c | 334 |
1 files changed, 2 insertions, 332 deletions
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c index 68461bc36..212d3bd72 100644 --- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c +++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c @@ -12,81 +12,14 @@ #include <assert.h> #include <smmintrin.h> -#include "./av1_rtcd.h" -#include "av1/common/filter.h" - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER -DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]); -#endif - -#if USE_TEMPORALFILTER_12TAP -DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]); -#endif +#include "config/av1_rtcd.h" -typedef int16_t (*HbdSubpelFilterCoeffs)[8]; +#include "av1/common/filter.h" typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, int src_stride, uint16_t *dst, int dst_stride, int bd); -static INLINE HbdSubpelFilterCoeffs -hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - if (p.interp_filter == MULTITAP_SHARP) { - return &subpel_filters_sharp[index][0]; - } -#endif -#if USE_TEMPORALFILTER_12TAP - if (p.interp_filter == TEMPORALFILTER_12TAP) { - return &subpel_temporalfilter[index][0]; - } -#endif - (void)p; - (void)index; - return NULL; -} - -static void init_simd_filter(const int16_t *filter_ptr, int taps, - int16_t (*simd_filter)[6][8]) { - int shift; - int offset = (12 - taps) / 2; - for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { - const int16_t *filter_row = filter_ptr + shift * taps; - int i, j; - for (i = 0; i < 12; ++i) { - for (j = 0; j < 4; ++j) { - int r = i / 2; - int c = j * 2 + (i % 2); - if (i - offset >= 0 && i - offset < taps) - simd_filter[shift - 1][r][c] = filter_row[i - offset]; - else - simd_filter[shift - 1][r][c] = 0; - } - } - } -} - -void av1_highbd_convolve_init_sse4_1(void) { -#if USE_TEMPORALFILTER_12TAP - { - InterpFilterParams filter_params = - av1_get_interp_filter_params(TEMPORALFILTER_12TAP); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_filter(filter_ptr, taps, subpel_temporalfilter); - } -#endif -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - InterpFilterParams filter_params = - av1_get_interp_filter_params(MULTITAP_SHARP); - int taps = filter_params.taps; - const int16_t *filter_ptr = filter_params.filter_ptr; - init_simd_filter(filter_ptr, taps, subpel_filters_sharp); - } -#endif -} - // pixelsNum 0: write all 4 pixels // 1/2/3: residual pixels 1/2/3 static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, @@ -218,138 +151,6 @@ void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, writePixel(u, width, pixelsNum, dst, dst_stride); } -static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 }; - -static INLINE void transpose_pair(__m128i *in, __m128i *out) { - __m128i x0, x1; - - x0 = _mm_unpacklo_epi32(in[0], in[1]); - x1 = _mm_unpacklo_epi32(in[2], in[3]); - - out[0] = _mm_unpacklo_epi64(x0, x1); - out[1] = _mm_unpackhi_epi64(x0, x1); - - x0 = _mm_unpackhi_epi32(in[0], in[1]); - x1 = _mm_unpackhi_epi32(in[2], in[3]); - - out[2] = _mm_unpacklo_epi64(x0, x1); - out[3] = _mm_unpackhi_epi64(x0, x1); - - x0 = _mm_unpacklo_epi32(in[4], in[5]); - x1 = _mm_unpacklo_epi32(in[6], in[7]); - - out[4] = _mm_unpacklo_epi64(x0, x1); - out[5] = _mm_unpackhi_epi64(x0, x1); -} - -static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, - int tapsNum, uint32_t *buf) { - __m128i u[8], v[6]; - - assert(tapsNum == 10 || tapsNum == 12); - if (tapsNum == 10) { - src -= 1; - } - - u[0] = _mm_loadu_si128((__m128i const *)src); - u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); - u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - u[4] = _mm_loadu_si128((__m128i const *)(src + 8)); - u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8)); - u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8)); - u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8)); - - transpose_pair(u, v); - - u[0] = _mm_madd_epi16(v[0], f[0]); - u[1] = _mm_madd_epi16(v[1], f[1]); - u[2] = _mm_madd_epi16(v[2], f[2]); - u[3] = _mm_madd_epi16(v[3], f[3]); - u[4] = _mm_madd_epi16(v[4], f[4]); - u[5] = _mm_madd_epi16(v[5], f[5]); - - u[6] = _mm_min_epi32(u[2], u[3]); - u[7] = _mm_max_epi32(u[2], u[3]); - - u[0] = _mm_add_epi32(u[0], u[1]); - u[0] = _mm_add_epi32(u[0], u[5]); - u[0] = _mm_add_epi32(u[0], u[4]); - u[0] = _mm_add_epi32(u[0], u[6]); - u[0] = _mm_add_epi32(u[0], u[7]); - - _mm_storeu_si128((__m128i *)buf, u[0]); -} - -void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - int avg, int bd) { - DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]); - __m128i verf[6]; - HbdSubpelFilterCoeffs vCoeffs; - const uint16_t *srcPtr; - const int tapsNum = filter_params.taps; - int i, col, count, blkResidu, blkHeight; - TransposeSave transSave = transSaveTab[avg]; - (void)x_step_q4; - - if (0 == subpel_x_q4 || 16 != x_step_q4) { - av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - return; - } - - vCoeffs = - hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); - if (!vCoeffs) { - av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - src -= (tapsNum >> 1) - 1; - srcPtr = src; - - count = 0; - blkHeight = h >> 2; - blkResidu = h & 3; - - while (blkHeight != 0) { - for (col = 0; col < w; col += 4) { - for (i = 0; i < 4; ++i) { - highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); - srcPtr += 1; - } - transSave(w, 0, temp, 4, dst + col, dst_stride, bd); - } - count++; - srcPtr = src + count * src_stride * 4; - dst += dst_stride * 4; - blkHeight--; - } - - if (blkResidu == 0) return; - - for (col = 0; col < w; col += 4) { - for (i = 0; i < 4; ++i) { - highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); - srcPtr += 1; - } - transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd); - } -} - // Vertical convolutional filter typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); @@ -402,134 +203,3 @@ static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { } WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; - -static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, - const __m128i *f, int taps, - uint16_t *dst, WritePixels saveFunc, - int bd) { - __m128i s[12]; - __m128i zero = _mm_setzero_si128(); - int i = 0; - int r = 0; - - // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case - assert(taps == 10 || taps == 12); - if (10 == taps) { - i += 1; - s[0] = zero; - } - while (i < 12) { - s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); - i += 1; - r += 1; - } - - s[0] = _mm_unpacklo_epi16(s[0], s[1]); - s[2] = _mm_unpacklo_epi16(s[2], s[3]); - s[4] = _mm_unpacklo_epi16(s[4], s[5]); - s[6] = _mm_unpacklo_epi16(s[6], s[7]); - s[8] = _mm_unpacklo_epi16(s[8], s[9]); - s[10] = _mm_unpacklo_epi16(s[10], s[11]); - - s[0] = _mm_madd_epi16(s[0], f[0]); - s[2] = _mm_madd_epi16(s[2], f[1]); - s[4] = _mm_madd_epi16(s[4], f[2]); - s[6] = _mm_madd_epi16(s[6], f[3]); - s[8] = _mm_madd_epi16(s[8], f[4]); - s[10] = _mm_madd_epi16(s[10], f[5]); - - s[1] = _mm_min_epi32(s[4], s[6]); - s[3] = _mm_max_epi32(s[4], s[6]); - - s[0] = _mm_add_epi32(s[0], s[2]); - s[0] = _mm_add_epi32(s[0], s[10]); - s[0] = _mm_add_epi32(s[0], s[8]); - s[0] = _mm_add_epi32(s[0], s[1]); - s[0] = _mm_add_epi32(s[0], s[3]); - - saveFunc(s, bd, dst); -} - -static void highbd_filter_vert_compute_large(const uint16_t *src, - int src_stride, const __m128i *f, - int taps, int w, int h, - uint16_t *dst, int dst_stride, - int avg, int bd) { - int col; - int rowIndex = 0; - const uint16_t *src_ptr = src; - uint16_t *dst_ptr = dst; - const int step = 4; - WritePixels write4pixels = write4pixelsTab[avg]; - - do { - for (col = 0; col < w; col += step) { - filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr, - write4pixels, bd); - src_ptr += step; - dst_ptr += step; - } - rowIndex++; - src_ptr = src + rowIndex * src_stride; - dst_ptr = dst + rowIndex * dst_stride; - } while (rowIndex < h); -} - -static void highbd_filter_vert_compute_small(const uint16_t *src, - int src_stride, const __m128i *f, - int taps, int w, int h, - uint16_t *dst, int dst_stride, - int avg, int bd) { - int rowIndex = 0; - WritePixels write2pixels = write2pixelsTab[avg]; - (void)w; - - do { - filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd); - rowIndex++; - src += src_stride; - dst += dst_stride; - } while (rowIndex < h); -} - -void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - int avg, int bd) { - __m128i verf[6]; - HbdSubpelFilterCoeffs vCoeffs; - const int tapsNum = filter_params.taps; - - if (0 == subpel_y_q4 || 16 != y_step_q4) { - av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); - return; - } - - vCoeffs = - hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); - if (!vCoeffs) { - av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); - return; - } - - verf[0] = *((const __m128i *)(vCoeffs)); - verf[1] = *((const __m128i *)(vCoeffs + 1)); - verf[2] = *((const __m128i *)(vCoeffs + 2)); - verf[3] = *((const __m128i *)(vCoeffs + 3)); - verf[4] = *((const __m128i *)(vCoeffs + 4)); - verf[5] = *((const __m128i *)(vCoeffs + 5)); - - src -= src_stride * ((tapsNum >> 1) - 1); - - if (w > 2) { - highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst, - dst_stride, avg, bd); - } else { - highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst, - dst_stride, avg, bd); - } -} |