author     trav90 <travawine@palemoon.org>    2018-10-19 21:52:15 -0500
committer  trav90 <travawine@palemoon.org>    2018-10-19 21:52:20 -0500
commit     bbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree       437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/aom_dsp/x86
parent     14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
99 files changed, 9493 insertions, 26260 deletions
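For orientation before the per-file diffs: the kernels touched throughout implement variants of libaom's 8-tap subpel convolve. Below is a hedged scalar model of that operation (constant names follow aom_dsp/aom_filter.h; the function itself is illustrative, not a library symbol):

#include <stddef.h>
#include <stdint.h>

#define SUBPEL_TAPS 8   /* taps per subpel kernel (aom_filter.h) */
#define FILTER_BITS 7   /* filter precision: taps sum to 1 << FILTER_BITS */

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar sketch of an 8-tap horizontal convolve: each output pixel is the
 * dot product of 8 neighboring source pixels with the subpel filter,
 * rounded, shifted down by FILTER_BITS, and clamped to 8 bits. */
static void convolve8_horiz_ref(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;  /* window starts 3 pixels left of the output */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += filter[k] * src[x + k];
      dst[x] = clip_pixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The SIMD kernels below vectorize this per block width (4/8/16). The removed avg_ variants additionally averaged the result into dst, and the removed hip/add_src variants were the higher-precision loop-restoration paths; both families are gone in v1.0.0.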
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index 4067b0b53..401fbdc48 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -9,8 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" #if HAVE_SSE2 @@ -20,12 +21,6 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2; filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d16_h8_avg_sse2; -filter8_1dfunction aom_filter_block1d8_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d8_h8_avg_sse2; -filter8_1dfunction aom_filter_block1d4_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d4_h8_avg_sse2; filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; @@ -33,12 +28,6 @@ filter8_1dfunction aom_filter_block1d8_v2_sse2; filter8_1dfunction aom_filter_block1d8_h2_sse2; filter8_1dfunction aom_filter_block1d4_v2_sse2; filter8_1dfunction aom_filter_block1d4_h2_sse2; -filter8_1dfunction aom_filter_block1d16_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d16_h2_avg_sse2; -filter8_1dfunction aom_filter_block1d8_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d8_h2_avg_sse2; -filter8_1dfunction aom_filter_block1d4_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; // void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -50,47 +39,16 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); -#if CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#if ARCH_X86_64 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; 
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; @@ -98,12 +56,6 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, // ptrdiff_t src_stride, @@ -123,60 +75,8 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#if CONFIG_LOOP_RESTORATION -// The SSE2 highbd convolve functions can deal with coefficients up to 32767. -// So redirect highbd_convolve8_add_src to regular highbd_convolve8. 
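Why the redirect below works: the "add_src" output is the ordinary filtered value plus the source pixel. The convolve core computes (dot(filter, window) + 64) >> 7 with FILTER_BITS == 7, and the window is positioned so that tap 3 lands on the source pixel itself, so bumping tap 3 by 1 << FILTER_BITS adds src << 7 to the dot product, which the final shift turns back into "+ src". A hedged arithmetic check (not library code):

// s[3] is the source pixel p for this output (the window starts at src - 3).
// Normal:  out = (sum_k f[k]*s[k] + 64) >> 7
// Bumped:  f[3] += 128  gives  sum' = sum + 128*p, and
//          (sum + 128*p + 64) >> 7 == ((sum + 64) >> 7) + p
// exactly, because 128*p is a multiple of 1 << 7 and passes through the
// shift intact. The bumped tap still fits in int16_t, which is why the
// comment above notes the SSE2 kernels tolerate coefficients up to 32767.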
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - ((int16_t *)filter_x)[3] += 128; - ((int16_t *)filter_y)[3] += 128; - aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); - ((int16_t *)filter_x)[3] -= 128; - ((int16_t *)filter_y)[3] -= 128; -} -#endif // CONFIG_LOOP_RESTORATION -#endif // CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#endif // ARCH_X86_64 #endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm index 4d3142867..7283c32b8 100644 --- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -50,7 +50,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ cmp r4d, 32 je .w32 -%if CONFIG_AV1 && CONFIG_EXT_PARTITION cmp r4d, 64 je .w64 %ifidn %2, highbd @@ -160,50 +159,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ jnz .loop128 RET -%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION - -%ifidn %2, highbd - cmp r4d, 64 - je .w64 - - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - sub r4d, 1 - jnz .loop128 - RET -%endif -%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION - .w64: mov r4d, dword hm .loop64: @@ -339,7 +294,4 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ INIT_XMM sse2 convolve_fn copy convolve_fn avg -%if CONFIG_HIGHBITDEPTH convolve_fn copy, highbd -convolve_fn avg, highbd -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c deleted file mode 100644 index 14352895d..000000000 --- a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> -#include <assert.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const int bd = 8; - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; - int intermediate_height = h + SUBPEL_TAPS - 1; - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + - (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = 
_mm_min_epi16(_mm_max_epi16(res, zero), - _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1)); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 
7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storel_epi64(p, res_8bit); - } - } - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm index e6d357ba3..b6f040791 100644 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -200,6 +200,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void aom_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, @@ -392,169 +394,6 @@ sym(aom_highbd_filter_block1d16_v8_sse2): pop rbp ret -global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movq xmm0, [rsi] ;load src: row 0 - movq xmm1, [rsi + rax] ;1 - movq xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2] ;7 - movq xmm2, [rsi + rax] ;2 - movq xmm3, [rsi + rax * 2] ;3 - movq xmm4, [rsi + rdx] ;4 - movq xmm5, [rsi + rax * 4] ;5 - - HIGH_APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - 
ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 16 - HIGH_APPLY_FILTER_8 1, 16 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void aom_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, @@ -772,194 +611,3 @@ sym(aom_highbd_filter_block1d16_h8_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm4, [rsi + 2] - movdqa xmm1, xmm0 - movdqa xmm6, xmm4 - movdqa xmm7, xmm4 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm4 - - psrldq xmm1, 2 - psrldq xmm6, 4 - psrldq xmm7, 6 - psrldq xmm2, 4 - psrldq xmm3, 6 - psrldq xmm5, 2 - - HIGH_APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - 
ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 10] ;load src - movdqu xmm1, [rsi + 12] - movdqu xmm2, [rsi + 14] - movdqu xmm3, [rsi + 16] - movdqu xmm4, [rsi + 18] - movdqu xmm5, [rsi + 20] - movdqu xmm6, [rsi + 22] - movdqu xmm7, [rsi + 24] - - HIGH_APPLY_FILTER_8 1, 16 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm index 9e2ec748c..7b3fe6419 100644 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -174,6 +174,8 @@ %endm %endif +SECTION .text + global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_v2_sse2): push rbp @@ -254,86 +256,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2): ret %endif -global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movq xmm0, [rsi] ;load src - movq xmm1, [rsi + 2*rax] - - HIGH_APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + 2*rax] ;1 - - HIGH_APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + 2*rax] ;1 - movdqu xmm2, [rsi + 16] - movdqu xmm3, [rsi + 2*rax + 16] - - HIGH_APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif - global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_h2_sse2): push rbp @@ -414,84 +336,3 @@ sym(aom_highbd_filter_block1d16_h2_sse2): pop rbp ret %endif - -global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 2 - - HIGH_APPLY_FILTER_4 1 - jnz .loop - - ; 
begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - - HIGH_APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - movdqu xmm2, [rsi + 16] - movdqu xmm3, [rsi + 18] - - HIGH_APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c deleted file mode 100644 index 74ce80e50..000000000 --- a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> -#include <assert.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -#if EXTRAPREC_BITS > 2 -#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2" -#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)" -#endif - -void aom_highbd_convolve8_add_src_hip_ssse3( - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); - - uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; - int intermediate_height = h + SUBPEL_TAPS - 1; - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. 
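The offset built on the next line is a vector that is zero in every 16-bit lane except lane 3, which holds 1 << FILTER_BITS; adding it to the loaded taps bumps the center tap, the same trick as the removed SSE2 redirect earlier in this diff. A hedged, standalone demo of that lane arithmetic (the taps are made-up example values):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i zero = _mm_setzero_si128();
  // Put 128 into lane 3 of an otherwise-zero vector (FILTER_BITS == 7).
  const __m128i offset = _mm_insert_epi16(zero, 1 << 7, 3);
  short taps[8] = { -1, 3, -10, 70, 70, -10, 3, -1 };  // example values only
  __m128i f = _mm_add_epi16(_mm_loadu_si128((const __m128i *)taps), offset);
  short out[8];
  _mm_storeu_si128((__m128i *)out, f);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // -1 3 -10 198 70 ...
  printf("\n");
  return 0;
}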
- const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + - (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1); - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 
* MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); - - const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, res_16bit); - } - } - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index 61476b8be..af45a03ac 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -11,31 +11,12 @@ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" #include "aom_ports/mem.h" -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 
8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - #if defined(__clang__) #if (__clang_major__ > 0 && __clang_major__ < 3) || \ (__clang_major__ == 3 && __clang_minor__ <= 3) || \ @@ -566,10 +547,4 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3; FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); -// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index be37738df..6bcb4a512 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -11,7 +11,8 @@ #include <tmmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve.h" #include "aom_mem/aom_mem.h" @@ -285,20 +286,6 @@ filter8_1dfunction aom_filter_block1d8_v8_ssse3; filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; -filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3; -#if CONFIG_LOOP_RESTORATION -filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3; -#endif filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; @@ -306,12 +293,6 @@ filter8_1dfunction aom_filter_block1d8_v2_ssse3; filter8_1dfunction aom_filter_block1d8_h2_ssse3; filter8_1dfunction aom_filter_block1d4_v2_ssse3; filter8_1dfunction aom_filter_block1d4_h2_ssse3; -filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3; // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -323,598 +304,5 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); FUN_CONV_1D(vert, 
y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#if CONFIG_LOOP_RESTORATION -FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_, - ssse3); -FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v, - src - src_stride * 3, add_src_, ssse3); -#endif - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = 
_mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)dst, temp); -} - -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); -} - -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. 
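// Concretely, the expression on the next line rounds h up to the next
// multiple of 8, and adds a full extra 8 when h is already a multiple of 8
// (a hedged worked example, not part of the source):
//   h = 5  ->  5 + (8 - (5 & 7))  =  8
//   h = 8  ->  8 + (8 - (8 & 7))  = 16   (one extra 8-row block)
//   h = 16 -> 16 + (8 - (16 & 7)) = 24
// The overshoot is absorbed by the "+ 8" rows of slack in scaledconvolve2d's
// temp buffer further down, so writing the extra block is harmless.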
- y = h + (8 - (h & 0x7)); - - do { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 8) { - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - transpose8x8_to_dst(temp, 8, dst + x, dst_stride); - } - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int 
*)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); -} - -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - for (y = 0; y < h; y += 4) { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 4) { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - transpose4x4_to_dst(temp, 4, dst + x, dst_stride); - } - - src += src_stride * 4; - dst += dst_stride * 4; - } -} - -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit 
each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - - if (y_q4 & SUBPEL_MASK) { - filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - - y_q4 += y_step_q4; - } -} - -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)dst, temp); -} - -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], 
w); - } - y_q4 += y_step_q4; - } -} - -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - int i; - - for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i *)&dst[i], temp_hi); - } -} - -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t 
dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, - w); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. 
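As a reading aid for the scaled-convolve driver above: all sub-pixel bookkeeping is in q4 (1/16-pel) fixed point, and the comment block derives why 135 intermediate rows suffice. The following is a minimal scalar reference sketch of the vertical pass those SSSE3 kernels vectorize — hypothetical helper names, but the same y_q4 stepping, the same memcpy shortcut at integer positions, and the same round-by-1/128 that _mm_mulhrs_epi16 with k_256 performs. (The SIMD versions additionally order the partial sums with min/max before the saturating adds so intermediate values stay inside the signed 16-bit range.)

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_TAPS 8

static uint8_t clamp_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void scaled_vert_ref(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t (*kernels)[SUBPEL_TAPS], int y0_q4,
                            int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up for the filter tails
  for (int y = 0, y_q4 = y0_q4; y < h; ++y, y_q4 += y_step_q4) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *k = kernels[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {  // sub-pixel phase: run the 8-tap filter
      for (int x = 0; x < w; ++x) {
        int sum = 0;
        for (int t = 0; t < SUBPEL_TAPS; ++t)
          sum += src_y[t * src_stride + x] * k[t];
        dst[y * dst_stride + x] = clamp_u8((sum + 64) >> 7);  // round, then /128
      }
    } else {  // integer position: plain row copy, exactly as the SIMD code does
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
  }
}
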
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - -// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_, ssse3); -#if CONFIG_LOOP_RESTORATION -FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3); -#endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm index b946010d3..c88fc9ffb 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -179,6 +179,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void aom_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, @@ -368,166 +370,6 @@ sym(aom_filter_block1d16_v8_sse2): pop rbp ret -global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d4_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd 
rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 1, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void aom_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, @@ -771,220 +613,3 @@ sym(aom_filter_block1d16_h8_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq 
xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index 8688fb544..3ca7921b6 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -375,17 +375,8 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ INIT_XMM ssse3 SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg - -%if CONFIG_LOOP_RESTORATION -SUBPIX_HFILTER16 h8_add_src -SUBPIX_HFILTER8 h8_add_src -SUBPIX_HFILTER4 h8_add_src -%endif ;------------------------------------------------------------------------------- @@ -875,15 +866,5 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ INIT_XMM ssse3 SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 - -%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \ - CONFIG_LOOP_RESTORATION -SUBPIX_VFILTER16 v8_add_src -SUBPIX_VFILTER v8_add_src, 8 -SUBPIX_VFILTER v8_add_src, 4 -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm index 8f025a8be..d0b4b2839 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -134,6 +134,8 @@ dec rcx %endm +SECTION .text + global sym(aom_filter_block1d4_v2_sse2) PRIVATE sym(aom_filter_block1d4_v2_sse2): push rbp @@ -212,84 +214,6 @@ sym(aom_filter_block1d16_v2_sse2): pop rbp ret -global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d4_v2_avg_sse2): - 
push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - global sym(aom_filter_block1d4_h2_sse2) PRIVATE sym(aom_filter_block1d4_h2_sse2): push rbp @@ -369,83 +293,3 @@ sym(aom_filter_block1d16_h2_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm index b9b2da0be..59edc49a9 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -108,6 +108,8 @@ dec rcx %endm +SECTION .text + global sym(aom_filter_block1d4_v2_ssse3) PRIVATE sym(aom_filter_block1d4_v2_ssse3): push rbp @@ -185,83 +187,6 @@ sym(aom_filter_block1d16_v2_ssse3): pop rbp ret -global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d4_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d8_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - 
GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d16_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - global sym(aom_filter_block1d4_h2_ssse3) PRIVATE sym(aom_filter_block1d4_h2_ssse3): push rbp @@ -340,82 +265,3 @@ sym(aom_filter_block1d16_h2_ssse3): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d4_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d8_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d16_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c deleted file mode 100644 index 1a6457402..000000000 --- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> - -#include "aom_dsp/x86/synonyms.h" - -#include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" - -void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); - // Row 0 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff0 = _mm_max_epi16(diff, negdiff); - // Row 1 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(absdiff0, absdiff); - minabsdiff = _mm_min_epi16(absdiff0, absdiff); - // Row 2 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 3 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 4 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 5 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 6 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 7 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); - *max = _mm_extract_epi16(maxabsdiff, 0); - - minabsdiff = 
_mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); - *min = _mm_extract_epi16(minabsdiff, 0); -} - -static void hadamard_col8_sse2(__m128i *in, int iter) { - __m128i a0 = in[0]; - __m128i a1 = in[1]; - __m128i a2 = in[2]; - __m128i a3 = in[3]; - __m128i a4 = in[4]; - __m128i a5 = in[5]; - __m128i a6 = in[6]; - __m128i a7 = in[7]; - - __m128i b0 = _mm_add_epi16(a0, a1); - __m128i b1 = _mm_sub_epi16(a0, a1); - __m128i b2 = _mm_add_epi16(a2, a3); - __m128i b3 = _mm_sub_epi16(a2, a3); - __m128i b4 = _mm_add_epi16(a4, a5); - __m128i b5 = _mm_sub_epi16(a4, a5); - __m128i b6 = _mm_add_epi16(a6, a7); - __m128i b7 = _mm_sub_epi16(a6, a7); - - a0 = _mm_add_epi16(b0, b2); - a1 = _mm_add_epi16(b1, b3); - a2 = _mm_sub_epi16(b0, b2); - a3 = _mm_sub_epi16(b1, b3); - a4 = _mm_add_epi16(b4, b6); - a5 = _mm_add_epi16(b5, b7); - a6 = _mm_sub_epi16(b4, b6); - a7 = _mm_sub_epi16(b5, b7); - - if (iter == 0) { - b0 = _mm_add_epi16(a0, a4); - b7 = _mm_add_epi16(a1, a5); - b3 = _mm_add_epi16(a2, a6); - b4 = _mm_add_epi16(a3, a7); - b2 = _mm_sub_epi16(a0, a4); - b6 = _mm_sub_epi16(a1, a5); - b1 = _mm_sub_epi16(a2, a6); - b5 = _mm_sub_epi16(a3, a7); - - a0 = _mm_unpacklo_epi16(b0, b1); - a1 = _mm_unpacklo_epi16(b2, b3); - a2 = _mm_unpackhi_epi16(b0, b1); - a3 = _mm_unpackhi_epi16(b2, b3); - a4 = _mm_unpacklo_epi16(b4, b5); - a5 = _mm_unpacklo_epi16(b6, b7); - a6 = _mm_unpackhi_epi16(b4, b5); - a7 = _mm_unpackhi_epi16(b6, b7); - - b0 = _mm_unpacklo_epi32(a0, a1); - b1 = _mm_unpacklo_epi32(a4, a5); - b2 = _mm_unpackhi_epi32(a0, a1); - b3 = _mm_unpackhi_epi32(a4, a5); - b4 = _mm_unpacklo_epi32(a2, a3); - b5 = _mm_unpacklo_epi32(a6, a7); - b6 = _mm_unpackhi_epi32(a2, a3); - b7 = _mm_unpackhi_epi32(a6, a7); - - in[0] = _mm_unpacklo_epi64(b0, b1); - in[1] = _mm_unpackhi_epi64(b0, b1); - in[2] = _mm_unpacklo_epi64(b2, b3); - in[3] = _mm_unpackhi_epi64(b2, b3); - in[4] = _mm_unpacklo_epi64(b4, b5); - in[5] = _mm_unpackhi_epi64(b4, b5); - in[6] = _mm_unpacklo_epi64(b6, b7); - in[7] = _mm_unpackhi_epi64(b6, b7); - } else { - in[0] = _mm_add_epi16(a0, a4); - in[7] = _mm_add_epi16(a1, a5); - in[3] = _mm_add_epi16(a2, a6); - in[4] = _mm_add_epi16(a3, a7); - in[2] = _mm_sub_epi16(a0, a4); - in[6] = _mm_sub_epi16(a1, a5); - in[1] = _mm_sub_epi16(a2, a6); - in[5] = _mm_sub_epi16(a3, a7); - } -} - -void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - __m128i src[8]; - src[0] = _mm_load_si128((const __m128i *)src_diff); - src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - - hadamard_col8_sse2(src, 0); - hadamard_col8_sse2(src, 1); - - _mm_store_si128((__m128i *)coeff, src[0]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, 
src[6]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); -} - -void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = - src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); - } - - for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); - - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); - - b0 = _mm_srai_epi16(b0, 1); - b1 = _mm_srai_epi16(b1, 1); - b2 = _mm_srai_epi16(b2, 1); - b3 = _mm_srai_epi16(b3, 1); - - coeff0 = _mm_add_epi16(b0, b2); - coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); - - coeff2 = _mm_sub_epi16(b0, b2); - coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); - - coeff += 8; - } -} - -int aom_satd_sse2(const int16_t *coeff, int length) { - int i; - const __m128i zero = _mm_setzero_si128(); - __m128i accum = zero; - - for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); - const __m128i inv = _mm_sub_epi16(zero, src_line); - const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) - const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); - const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); - const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); - accum = _mm_add_epi32(accum, sum); - coeff += 8; - } - - { // cascading summation of accum - __m128i hi = _mm_srli_si128(accum, 8); - accum = _mm_add_epi32(accum, hi); - hi = _mm_srli_epi64(accum, 32); - accum = _mm_add_epi32(accum, hi); - } - - return _mm_cvtsi128_si32(accum); -} - -void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride, - int height) { - int idx; - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_loadu_si128((const __m128i *)ref); - __m128i s0 = _mm_unpacklo_epi8(src_line, zero); - __m128i s1 = _mm_unpackhi_epi8(src_line, zero); - __m128i t0, t1; - int height_1 = height - 1; - ref += ref_stride; - - for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - } - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - - if (height == 64) { - s0 = _mm_srai_epi16(s0, 5); - s1 = _mm_srai_epi16(s1, 5); - } else if (height == 32) { - s0 = _mm_srai_epi16(s0, 4); - s1 = _mm_srai_epi16(s1, 4); - } else { - s0 = _mm_srai_epi16(s0, 3); - s1 = _mm_srai_epi16(s1, 3); - } - - _mm_storeu_si128((__m128i *)hbuf, s0); - hbuf += 8; - _mm_storeu_si128((__m128i *)hbuf, 
s1); -} - -int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) { - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); - __m128i s0 = _mm_sad_epu8(src_line, zero); - __m128i s1; - int i; - - for (i = 16; i < width; i += 16) { - ref += 16; - src_line = _mm_load_si128((const __m128i *)ref); - s1 = _mm_sad_epu8(src_line, zero); - s0 = _mm_adds_epu16(s0, s1); - } - - s1 = _mm_srli_si128(s0, 8); - s0 = _mm_adds_epu16(s0, s1); - - return _mm_extract_epi16(s0, 0); -} - -int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) { - int idx; - int width = 4 << bwl; - int16_t mean; - __m128i v0 = _mm_loadu_si128((const __m128i *)ref); - __m128i v1 = _mm_load_si128((const __m128i *)src); - __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sum = diff; - __m128i sse = _mm_madd_epi16(diff, diff); - - ref += 8; - src += 8; - - for (idx = 8; idx < width; idx += 8) { - v0 = _mm_loadu_si128((const __m128i *)ref); - v1 = _mm_load_si128((const __m128i *)src); - diff = _mm_subs_epi16(v0, v1); - - sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); - sse = _mm_add_epi32(sse, v0); - - ref += 8; - src += 8; - } - - v0 = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, v0); - - v1 = _mm_srli_si128(sse, 8); - sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); - sse = _mm_add_epi32(sse, v1); - - mean = _mm_extract_epi16(sum, 0); - - return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); -} diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm deleted file mode 100644 index b2d150296..000000000 --- a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm +++ /dev/null @@ -1,124 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%define private_prefix aom - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the hadamard transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
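The deleted avg_intrin_sse2.c and avg_ssse3_x86_64.asm kernels above are all built on the same 8-point Hadamard butterfly. A minimal scalar sketch of that 1-D stage follows (hypothetical helper; the real kernels also permute the output order and interleave the row and column passes with an 8x8 transpose):

#include <stdint.h>

/* One 1-D 8-point Hadamard: three butterfly stages of paired add/subtract. */
static void hadamard8_1d(int16_t v[8]) {
  for (int stride = 1; stride < 8; stride <<= 1) {
    for (int i = 0; i < 8; i += 2 * stride) {
      for (int j = i; j < i + stride; ++j) {
        const int16_t a = v[j];
        const int16_t b = v[j + stride];
        v[j] = (int16_t)(a + b);           /* sum lane */
        v[j + stride] = (int16_t)(a - b);  /* difference lane */
      }
    }
  }
}

As the deleted aom_hadamard_16x16_sse2 shows, the 16x16 transform is then assembled from four 8x8 transforms plus one more butterfly stage over the four coefficient blocks, with a right shift by 1 at that stage to keep the coefficients within 16 bits.
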
- -SECTION .text - -%if ARCH_X86_64 -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro HMD8_1D 0 - psubw m8, m0, m1 - psubw m9, m2, m3 - paddw m0, m1 - paddw m2, m3 - SWAP 1, 8 - SWAP 3, 9 - psubw m8, m4, m5 - psubw m9, m6, m7 - paddw m4, m5 - paddw m6, m7 - SWAP 5, 8 - SWAP 7, 9 - - psubw m8, m0, m2 - psubw m9, m1, m3 - paddw m0, m2 - paddw m1, m3 - SWAP 2, 8 - SWAP 3, 9 - psubw m8, m4, m6 - psubw m9, m5, m7 - paddw m4, m6 - paddw m5, m7 - SWAP 6, 8 - SWAP 7, 9 - - psubw m8, m0, m4 - psubw m9, m1, m5 - paddw m0, m4 - paddw m1, m5 - SWAP 4, 8 - SWAP 5, 9 - psubw m8, m2, m6 - psubw m9, m3, m7 - paddw m2, m6 - paddw m3, m7 - SWAP 6, 8 - SWAP 7, 9 -%endmacro - -INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output - lea r3, [2 * strideq] - lea r4, [4 * strideq] - - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - HMD8_1D - - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 - - RET -%endif diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c index e916e4ff9..4f5e3f8c1 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -11,7 +11,7 @@ #include "aom/aom_integer.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" // To start out, just dispatch to the function using the 2D mask and // pass mask stride as 0. This can be improved upon if necessary. 
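The hmask dispatch above works because the 2-D kernel only ever indexes the mask as mask[i * mask_stride + j]; passing mask_stride == 0 makes every row re-read the same 1-D horizontal mask. For reference, a scalar sketch of the underlying A64 blend these SSE4.1 kernels vectorize (hedged reference, not the library API; AOM_BLEND_A64_MAX_ALPHA is 64, so the two weights always sum to 64 and the result is rounded back down by 6 bits):

#include <stdint.h>

static void blend_a64_ref(uint8_t *dst, uint32_t dst_stride,
                          const uint8_t *src0, uint32_t src0_stride,
                          const uint8_t *src1, uint32_t src1_stride,
                          const uint8_t *mask, uint32_t mask_stride,
                          int w, int h) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int m = mask[i * mask_stride + j];  /* alpha in 0..64 */
      dst[i * dst_stride + j] =
          (uint8_t)((m * src0[i * src0_stride + j] +
                     (64 - m) * src1[i * src1_stride + j] + 32) >> 6);
    }
  }
}
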
@@ -19,18 +19,16 @@ void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, 0, h, w, 0, 0); + src1_stride, mask, 0, w, h, 0, 0); } -#if CONFIG_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, - src1_8, src1_stride, mask, 0, h, w, 0, 0, + src1_8, src1_stride, mask, 0, w, h, 0, 0, bd); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c index 68d74e517..49c20b467 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -21,7 +21,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" ////////////////////////////////////////////////////////////////////////////// // No sub-sampling @@ -31,7 +31,7 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -58,7 +58,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -84,7 +84,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -119,7 +119,7 @@ static void blend_a64_mask_w16n_sse4_1( static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -149,7 +149,7 @@ static void blend_a64_mask_sx_w4_sse4_1( static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ 
-179,7 +179,7 @@ static void blend_a64_mask_sx_w8_sse4_1( static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -219,7 +219,7 @@ static void blend_a64_mask_sx_w16n_sse4_1( static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -248,7 +248,7 @@ static void blend_a64_mask_sy_w4_sse4_1( static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -277,7 +277,7 @@ static void blend_a64_mask_sy_w8_sse4_1( static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -315,7 +315,7 @@ static void blend_a64_mask_sy_w16n_sse4_1( static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -350,7 +350,7 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -385,7 +385,7 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -435,12 +435,12 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t 
dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - int w, int suby, int subx) { + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w); + const uint8_t *mask, uint32_t mask_stride, int w, int h); // Dimensions are: width_index X subx X suby static const blend_fn blend[3][2][2] = { @@ -465,15 +465,14 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, h, w, suby, subx); + mask, mask_stride, w, h, subx, suby); } else { blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, h, w); + mask, mask_stride, w, h); } } -#if CONFIG_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -503,7 +502,7 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1( static void blend_a64_mask_b10_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); @@ -512,7 +511,7 @@ static void blend_a64_mask_b10_w4_sse4_1( static void blend_a64_mask_b12_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); @@ -521,7 +520,7 @@ static void blend_a64_mask_b12_w4_sse4_1( static INLINE void blend_a64_mask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -546,18 +545,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1( static void blend_a64_mask_b10_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, 
src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -594,7 +593,7 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( static void blend_a64_mask_b10_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -604,7 +603,7 @@ static void blend_a64_mask_b10_sx_w4_sse4_1( static void blend_a64_mask_b12_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -614,7 +613,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1( static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); @@ -643,18 +642,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( static void blend_a64_mask_b10_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -690,7 +689,7 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( static void blend_a64_mask_b10_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -700,7 +699,7 @@ static void blend_a64_mask_b10_sy_w4_sse4_1( static void blend_a64_mask_b12_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -710,7 +709,7 @@ static void 
blend_a64_mask_b12_sy_w4_sse4_1( static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -738,18 +737,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( static void blend_a64_mask_b10_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -791,7 +790,7 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( static void blend_a64_mask_b10_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -801,7 +800,7 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1( static void blend_a64_mask_b12_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -811,7 +810,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1( static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); @@ -845,18 +844,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t 
src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -869,12 +868,12 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, - uint32_t mask_stride, int h, int w, - int suby, int subx, int bd) { + uint32_t mask_stride, int w, int h, + int subx, int suby, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w); + const uint8_t *mask, uint32_t mask_stride, int w, int h); // Dimensions are: bd_index X width_index X subx X suby static const blend_fn blend[2][2][2][2] = { @@ -909,8 +908,8 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, h, w, suby, - subx, bd); + src1_stride, mask, mask_stride, w, h, subx, + suby, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -918,7 +917,113 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w); + mask_stride, w, h); + } +} + +static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0, + const CONV_BUF_TYPE *src1, + const __m128i *m, + const __m128i *v_round_offset, + const __m128i *v_maxval, int round_bits) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS); + const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset); + const __m128i res_d = xx_roundn_epi32(res_c, round_bits); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_maxval = 
_mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_ro_a = xx_loadl_32(&round_offset); + const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0); + const __m128i one_w = _mm_set1_epi16(1); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = + xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]); + const __m128i m_i1 = + xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b); + const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); + const __m128i m = _mm_srli_epi16(m_ac_1, 1); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]); + const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]); + const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1); + const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); + const __m128i m = _mm_srli_epi16(m_ac_1, 1); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c index 9dabe5b79..59506bdfe 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -21,7 +21,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling @@ -30,7 +30,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -55,7 +55,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const 
uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -82,7 +82,7 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -112,11 +112,11 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + const uint8_t *mask, int w, int h); // Dimension: width_index static const blend_fn blend[9] = { @@ -139,11 +139,10 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, - w); + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); } -#if CONFIG_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -174,7 +173,7 @@ static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b10); @@ -185,7 +184,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b12); @@ -194,7 +193,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, static INLINE void blend_a64_vmask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w, blend_unit_fn blend) { + const uint8_t *mask, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -218,9 +217,9 @@ static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, blend_8_b10); + src1_stride, mask, w, h, blend_8_b10); } static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, @@ -228,9 +227,9 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { 
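The two dispatch idioms running through these blend functions are worth spelling out. blend[w & 0xf] works because w is asserted to be a power of two: widths 4 and 8 index their own slots, and every multiple of 16 folds onto slot 0, the w16n kernel. Likewise, (h | w) & 3 is a compact test for "either dimension is not a multiple of 4" (the commented w <= 2 || h <= 2 case, given power-of-two sizes). An illustrative standalone check, not library code:

#include <assert.h>

/* Illustrates the dispatch arithmetic used by the blend wrappers above. */
static void check_blend_dispatch(void) {
  /* width table index: multiples of 16 fold onto slot 0 (the w16n kernel) */
  assert((4 & 0xf) == 4);  /* w == 4  -> w4 kernel   */
  assert((8 & 0xf) == 8);  /* w == 8  -> w8 kernel   */
  assert((16 & 0xf) == 0); /* w == 16 -> w16n kernel */
  assert((32 & 0xf) == 0); /* w == 32 -> w16n kernel */
  /* (h | w) & 3 is nonzero iff h or w is not a multiple of 4 */
  assert(((2 | 8) & 3) != 0);  /* w == 2 forces the C fallback   */
  assert(((4 | 16) & 3) == 0); /* both multiples of 4: SIMD path */
}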
blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, blend_8_b12); + src1_stride, mask, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// @@ -240,11 +239,11 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, void aom_highbd_blend_a64_vmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + const uint8_t *mask, int w, int h); // Dimensions are: bd_index X width_index static const blend_fn blend[2][2] = { @@ -272,14 +271,13 @@ void aom_highbd_blend_a64_vmask_sse4_1( if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, h, w, bd); + src1_stride, mask, w, h, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w); + src1_stride, mask, w, h); } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h index daa2b2b3a..4880438bc 100644 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -53,7 +53,6 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } -#if CONFIG_HIGHBITDEPTH typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); @@ -141,6 +140,5 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -#endif // CONFIG_HIGHBITDEPTH #endif // AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h index 5f9596a74..3f46420dd 100644 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -14,7 +14,7 @@ #include <immintrin.h> -#include "./aom_config.h" +#include "config/aom_config.h" // Note: in and out could have the same value static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h index 8641164db..36fb1963a 100644 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -13,7 +13,8 @@ #include <assert.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_convolve.h" @@ -84,102 +85,6 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void aom_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ - assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - 
assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ - filter_y[1] || filter_y[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ - MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h + 7); \ - aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ - aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 1); \ - aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ - } - -#if CONFIG_LOOP_RESTORATION -// convolve_add_src is only used by the Wiener filter, which will never -// end up calling the bilinear functions (it uses a symmetric filter, so -// the possible numbers of taps are 1,3,5,7) -#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \ - opt) \ - void aom_convolve8_##name##_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ - (void)x_step_q4; \ - (void)filter_y; \ - (void)y_step_q4; \ - assert((-128 <= filter[3]) && (filter[3] <= 127)); \ - assert(step_q4 == 16); \ - while (w >= 16) { \ - aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - if (w) { \ - aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ - } - -#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \ - void aom_convolve8_##type##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ - assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - aom_convolve8_##htype##horiz_##opt( \ - src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 7); \ - aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h); \ - } -#endif - -#if CONFIG_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr, @@ -248,41 +153,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void aom_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t 
*filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, \ - fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - aom_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ - dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, \ - fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ - aom_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - aom_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } -#endif // CONFIG_HIGHBITDEPTH - #endif // AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..7790baf2e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_DSP_X86_CONVOLVE_AVX2_H_ + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. 
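A scalar sketch of why that halving is lossless, under the same assumption the SIMD code asserts just below (every tap of the 8-tap filter is even):

#include <assert.h>

/* Not library code: with even taps, sum(c[i]*s[i]) == 2*sum((c[i]>>1)*s[i]),
 * so the dropped factor of two is restored by shifting one bit less when the
 * accumulated result is rounded at the end. */
static void halved_even_taps_are_exact(const short c[8],
                                       const unsigned char s[8]) {
  int full = 0, half = 0;
  for (int i = 0; i < 8; ++i) {
    assert((c[i] & 1) == 0); /* evenness assumption, as in the SIMD code */
    full += c[i] * s[i];
    half += (c[i] >> 1) * s[i];
  }
  assert(full == 2 * half); /* hence (full >> k) == (half >> (k - 1)) */
}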
+ // This extra right shift will be taken care of at the end while rounding + // the result. + // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16(0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, 
*wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const __m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 000000000..e80c5872f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 000000000..846fe7bb4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
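Per pixel, comp_avg and highbd_comp_avg above reduce to either a plain average of the two compound predictions or a distance-weighted one in fixed point. A scalar sketch; DIST_PRECISION_BITS is taken here to be 4 (weight pairs summing to 16), which should be treated as an assumption:

/* Scalar view of comp_avg / highbd_comp_avg (illustrative only). */
static int comp_avg_scalar(int p0, int p1, int w0, int w1,
                           int use_jnt_comp_avg) {
  if (use_jnt_comp_avg)
    return (p0 * w0 + p1 * w1) >> 4; /* DIST_PRECISION_BITS (assumed 4) */
  return (p0 + p1) >> 1;             /* plain average */
}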
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_DSP_X86_CONVOLVE_SSE2_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res 
= _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 000000000..d48c25667 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ +#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c new file mode 100644 index 000000000..54da02253 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
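convolve_rounding and its highbd twin above undo the unsigned-offset trick in one step: subtract the offset that kept the intermediate non-negative, add the rounding constant, and shift down to the final precision. As a scalar identity (sketch only):

/* Sketch of the rounding step; round_const is expected to be
 * 1 << (round_shift - 1) so the shift rounds to nearest. */
static int convolve_rounding_scalar(int res_unsigned, int offset,
                                    int round_const, int round_shift) {
  const int res_signed = res_unsigned - offset;
  return (res_signed + round_const) >> round_shift;
}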
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +extern void aom_transpose_float_sse2(const float *A, float *B, int n); +extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, + int n); + +// Generate the 1d forward transforms for float using _mm256 +GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +// Generate the 1d inverse transforms for float using _mm256 +GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, + aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); +} + +void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_avx2, aom_ifft1d_16_avx2, + aom_transpose_float_sse2, 8); +} + +void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_avx2, aom_ifft1d_32_avx2, + aom_transpose_float_sse2, 8); +} diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c new file mode 100644 index 000000000..12bdc3e18 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fft_sse2.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
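The 2-D transforms in fft_avx2.c above are all assembled the same way: generated 1-D kernels, a transpose, and a final unpack of the packed real-input layout, all driven by aom_fft_2d_gen. A usage sketch for the 8x8 forward transform; the buffer sizes and the 32-byte alignment are assumptions inferred from the unpack routine, which writes interleaved real/imaginary pairs:

#include "aom_ports/mem.h" /* DECLARE_ALIGNED, as used elsewhere in aom_dsp */

void aom_fft8x8_float_avx2(const float *input, float *temp, float *output);

static void fft8x8_demo(void) {
  DECLARE_ALIGNED(32, float, input[8 * 8]);
  DECLARE_ALIGNED(32, float, temp[8 * 8]);
  DECLARE_ALIGNED(32, float, output[2 * 8 * 8]); /* interleaved re/im */
  for (int i = 0; i < 8 * 8; ++i) input[i] = (i == 0) ? 1.0f : 0.0f;
  aom_fft8x8_float_avx2(input, temp, output);
  /* For an unnormalized DFT, a unit impulse at (0,0) gives a flat
   * spectrum: real part 1.0f, imaginary part 0.0f in every bin. */
}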
+ */ + +#include <xmmintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + 
_mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c deleted file mode 100644 index b8ec08de7..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c +++ /dev/null @@ -1,862 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
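One detail worth noting from fft_sse2.c above: aom_transpose_float_sse2 walks the matrix in 4x4 tiles so each tile is transposed entirely in registers by _MM_TRANSPOSE4_PS. A plain-C rendering of the same tiling (illustrative; assumes n is a multiple of 4, as the SSE2 loop bounds do):

static void transpose4x4_c(const float *A, float *B, int lda, int ldb) {
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) B[c * ldb + r] = A[r * lda + c];
}

static void transpose_float_c(const float *A, float *B, int n) {
  /* same 4x4 blocking as the SSE2 version above */
  for (int y = 0; y < n; y += 4)
    for (int x = 0; x < n; x += 4)
      transpose4x4_c(A + y * n + x, B + x * n + y, n, n);
}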
- */ - -#include <emmintrin.h> // SSE2 - -#include "aom_dsp/fwd_txfm.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// Apply a 32-element IDCT to 8 columns. This does not do any transposition -// of its output - the caller is expected to do that. -// The input buffers are the top and bottom halves of an 8x32 block. -void fdct32_8col(__m128i *in0, __m128i *in1) { - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - { - const __m128i *ina = in0; - const 
__m128i *inb = in1 + 15; - __m128i *step1a = &step1[0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 4; - const __m128i *inb = in1 + 11; - __m128i *step1a = &step1[4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 8; - const __m128i *inb = in1 + 7; - __m128i *step1a = &step1[8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 12; - const __m128i *inb = in1 + 3; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const 
__m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - // Stage 2 - { - step2[0] = _mm_add_epi16(step1[0], step1[15]); - step2[1] = _mm_add_epi16(step1[1], step1[14]); - step2[2] = _mm_add_epi16(step1[2], step1[13]); - step2[3] = _mm_add_epi16(step1[3], step1[12]); - step2[4] = _mm_add_epi16(step1[4], step1[11]); - step2[5] = _mm_add_epi16(step1[5], step1[10]); - step2[6] = _mm_add_epi16(step1[6], step1[9]); - step2[7] = _mm_add_epi16(step1[7], step1[8]); - step2[8] = _mm_sub_epi16(step1[7], step1[8]); - step2[9] = _mm_sub_epi16(step1[6], step1[9]); - step2[10] = _mm_sub_epi16(step1[5], step1[10]); - step2[11] = _mm_sub_epi16(step1[4], step1[11]); - step2[12] = _mm_sub_epi16(step1[3], step1[12]); - step2[13] = _mm_sub_epi16(step1[2], step1[13]); - step2[14] = _mm_sub_epi16(step1[1], step1[14]); - step2[15] = _mm_sub_epi16(step1[0], step1[15]); - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, 
k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); - } - // Stage 3 - { - step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const 
__m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm_add_epi16(step2[23], step1[16]); - step3[17] = _mm_add_epi16(step2[22], step1[17]); - step3[18] = _mm_add_epi16(step2[21], step1[18]); - step3[19] = _mm_add_epi16(step2[20], step1[19]); - step3[20] = _mm_sub_epi16(step1[19], step2[20]); - step3[21] = _mm_sub_epi16(step1[18], step2[21]); - step3[22] = _mm_sub_epi16(step1[17], step2[22]); - step3[23] = _mm_sub_epi16(step1[16], step2[23]); - step3[24] = _mm_sub_epi16(step1[31], step2[24]); - step3[25] = _mm_sub_epi16(step1[30], step2[25]); - step3[26] = _mm_sub_epi16(step1[29], step2[26]); - step3[27] = _mm_sub_epi16(step1[28], step2[27]); - step3[28] = _mm_add_epi16(step2[27], step1[28]); - step3[29] = _mm_add_epi16(step2[26], step1[29]); - step3[30] = _mm_add_epi16(step2[25], step1[30]); - step3[31] = _mm_add_epi16(step2[24], step1[31]); - } - - // Stage 4 - { - step1[0] = _mm_add_epi16(step3[3], step3[0]); - step1[1] = _mm_add_epi16(step3[2], step3[1]); - step1[2] = _mm_sub_epi16(step3[1], step3[2]); - step1[3] = _mm_sub_epi16(step3[0], step3[3]); - step1[8] = _mm_add_epi16(step3[11], step2[8]); - step1[9] = _mm_add_epi16(step3[10], step2[9]); - step1[10] = _mm_sub_epi16(step2[9], step3[10]); - step1[11] = _mm_sub_epi16(step2[8], step3[11]); - step1[12] = _mm_sub_epi16(step2[15], step3[12]); - step1[13] = _mm_sub_epi16(step2[14], step3[13]); - step1[14] = _mm_add_epi16(step3[13], step2[14]); - step1[15] = _mm_add_epi16(step3[12], step2[15]); - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // 
Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, 
DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm_add_epi16(step1[5], step3[4]); - step2[5] = _mm_sub_epi16(step3[4], step1[5]); - step2[6] = _mm_sub_epi16(step3[7], step1[6]); - step2[7] = _mm_add_epi16(step1[6], step3[7]); - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const 
__m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm_add_epi16(step1[19], step3[16]); - step2[17] = _mm_add_epi16(step1[18], step3[17]); - step2[18] = _mm_sub_epi16(step3[17], step1[18]); - step2[19] = _mm_sub_epi16(step3[16], step1[19]); - step2[20] = _mm_sub_epi16(step3[23], step1[20]); - step2[21] = _mm_sub_epi16(step3[22], step1[21]); - step2[22] = _mm_add_epi16(step1[21], step3[22]); - step2[23] = _mm_add_epi16(step1[20], step3[23]); - step2[24] = _mm_add_epi16(step1[27], step3[24]); - step2[25] = _mm_add_epi16(step1[26], step3[25]); - step2[26] = _mm_sub_epi16(step3[25], step1[26]); - step2[27] = _mm_sub_epi16(step3[24], step1[27]); - step2[28] = _mm_sub_epi16(step3[31], step1[28]); - step2[29] = _mm_sub_epi16(step3[30], step1[29]); - step2[30] = _mm_add_epi16(step1[29], step3[30]); - step2[31] = _mm_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, 
k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); - } - { - step3[8] = _mm_add_epi16(step2[9], step1[8]); - step3[9] = _mm_sub_epi16(step1[8], step2[9]); - step3[10] = _mm_sub_epi16(step1[11], step2[10]); - step3[11] = _mm_add_epi16(step2[10], step1[11]); - step3[12] = _mm_add_epi16(step2[13], step1[12]); - step3[13] = _mm_sub_epi16(step1[12], step2[13]); - step3[14] = _mm_sub_epi16(step1[15], step2[14]); - step3[15] = _mm_add_epi16(step2[14], step1[15]); - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - 
const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = 
_mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - 
out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm_add_epi16(step3[17], step2[16]); - step1[17] = _mm_sub_epi16(step2[16], step3[17]); - step1[18] = _mm_sub_epi16(step2[19], step3[18]); - step1[19] = _mm_add_epi16(step3[18], step2[19]); - step1[20] = _mm_add_epi16(step3[21], step2[20]); - step1[21] = _mm_sub_epi16(step2[20], step3[21]); - step1[22] = _mm_sub_epi16(step2[23], step3[22]); - step1[23] = _mm_add_epi16(step3[22], step2[23]); - step1[24] = _mm_add_epi16(step3[25], step2[24]); - step1[25] = _mm_sub_epi16(step2[24], step3[25]); - step1[26] = _mm_sub_epi16(step2[27], step3[26]); - step1[27] = _mm_add_epi16(step3[26], step2[27]); - step1[28] = _mm_add_epi16(step3[29], step2[28]); - step1[29] = _mm_sub_epi16(step2[28], step3[29]); - step1[30] = _mm_sub_epi16(step2[31], step3[30]); - step1[31] = _mm_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, 
k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i 
out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-    const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-    const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-    // dct_const_round_shift
-    const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-    const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-    const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-    const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-    const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-    const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-    const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-    const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-    const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-    const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-    const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-    const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-    const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-    const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-    const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-    const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-    // Combine
-    out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-    out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-    out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-    out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-    out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-    out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-    out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-    out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-  }
-
-  // Output results
-  {
-    int j;
-    for (j = 0; j < 16; ++j) {
-      _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
-      _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
-    }
-  }
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
deleted file mode 100644
index 216739581..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ /dev/null
@@ -1,3022 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_intrin.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-#if FDCT32x32_HIGH_PRECISION
-static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
-  __m256i buf0, buf1;
-  buf0 = _mm256_mul_epu32(a, b);
-  a = _mm256_srli_epi64(a, 32);
-  b = _mm256_srli_epi64(b, 32);
-  buf1 = _mm256_mul_epu32(a, b);
-  return _mm256_add_epi64(buf0, buf1);
-}
-
-static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
-  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm256_unpacklo_epi64(buf0, buf1);
-}
-#endif
-
-#ifndef STORE_COEFF_FUNC
-#define STORE_COEFF_FUNC
-static void store_coeff(const __m256i *coeff, tran_low_t *curr,
-                        tran_low_t *next) {
-  __m128i u = _mm256_castsi256_si128(*coeff);
-  storeu_output(&u, curr);
-  u = _mm256_extractf128_si256(*coeff, 1);
-  storeu_output(&u, next);
-}
-#endif
-
-void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
-                       int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
-  // Constants
-  // When we use them, in one case, they are all the same. In all others
-  // it's a pair of them that we need to repeat four times. This is done
-  // by constructing the 32 bit constant corresponding to that pair.
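The two high-precision helpers above work around AVX2's lack of a full 32-bit multiply: _mm256_mul_epu32 multiplies only the even 32-bit lanes, so k_madd_epi32_avx2 shifts the odd lanes down by 32 and sums the two partial products into each 64-bit slot, while k_packs_epi64_avx2 then gathers the low 32 bits of those 64-bit results back into packed 32-bit form. A scalar model of one 64-bit lane, assuming the operands behave as unsigned 32-bit values (the array names here are illustrative, not from the file):

    /* Given eight 32-bit inputs a[0..7] and b[0..7], lane i of the result
     * of k_madd_epi32_avx2 is a two-element dot product: */
    uint64_t lane_i = (uint64_t)a[2 * i] * b[2 * i] +
                      (uint64_t)a[2 * i + 1] * b[2 * i + 1];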
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i k__cospi_p16_m16 =
-      pair256_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i k__cospi_m12_m20 =
-      pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
-  const __m256i kOne = _mm256_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-  for (pass = 0; pass < 2; ++pass) {
-    // We process sixteen columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 16) {
-      __m256i step1[32];
-      __m256i step2[32];
-      __m256i step3[32];
-      __m256i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      // intrinsic make the code slightly slower.
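Every rotation below leans on the idiom these constants set up: pair256_set_epi16(c0, c1) repeats a cosine pair across the sixteen 16-bit lanes, _mm256_unpacklo/unpackhi_epi16 interleaves the two operands, and _mm256_madd_epi16 then produces x * c0 + y * c1 in each 32-bit lane; adding k__DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS is the vector form of dct_const_round_shift. A scalar sketch of one such output (the helper name is illustrative, not from the file):

    static int16_t butterfly_half(int16_t x, int16_t y, int16_t c0, int16_t c1) {
      /* _mm256_madd_epi16 on an interleaved (x, y) pair against (c0, c1): */
      const int32_t t = (int32_t)x * c0 + (int32_t)y * c1;
      /* dct_const_round_shift: add the bias, then arithmetic shift back. */
      return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }

Each stage that follows is simply many copies of this operation applied to different index pairs with different cosine pairs.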
- if (0 == pass) { - const int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m256i *step1a = &step1[0]; - __m256i *step1b = &step1[31]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m256i *step1a = &step1[4]; - __m256i *step1b = &step1[27]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m256i *step1a = &step1[8]; - __m256i *step1b = &step1[23]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina 
+ str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; - __m256i *step1a = &step1[12]; - __m256i *step1b = &step1[19]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. 
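In both passes, stage 1 folds row i against its mirror row 31 - i: sums land in step1[0..15] and differences in step1[31..16]. Pass 0 additionally pre-scales by << 2 so the fixed-point stages keep precision, while pass 1 (below) reads the 32x32 intermediate with no extra scaling. Following the comments in the code, the scalar shape per column is:

    /* Pass 0, reading the input frame:                                 */
    /*   step1[i]      = (in[i * stride] + in[(31 - i) * stride]) << 2; */
    /*   step1[31 - i] = (in[i * stride] - in[(31 - i) * stride]) << 2; */
    /* Pass 1, reading the intermediate buffer:                         */
    /*   step1[i]      = in[i * 32] + in[(31 - i) * 32];                */
    /*   step1[31 - i] = in[i * 32] - in[(31 - i) * 32];                */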
- { - __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); - __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); - __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); - __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); - __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); - __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); - __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); - __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); - step1[0] = _mm256_add_epi16(in00, in31); - step1[1] = _mm256_add_epi16(in01, in30); - step1[2] = _mm256_add_epi16(in02, in29); - step1[3] = _mm256_add_epi16(in03, in28); - step1[28] = _mm256_sub_epi16(in03, in28); - step1[29] = _mm256_sub_epi16(in02, in29); - step1[30] = _mm256_sub_epi16(in01, in30); - step1[31] = _mm256_sub_epi16(in00, in31); - } - { - __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); - __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); - __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); - __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); - __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); - __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); - __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); - __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); - step1[4] = _mm256_add_epi16(in04, in27); - step1[5] = _mm256_add_epi16(in05, in26); - step1[6] = _mm256_add_epi16(in06, in25); - step1[7] = _mm256_add_epi16(in07, in24); - step1[24] = _mm256_sub_epi16(in07, in24); - step1[25] = _mm256_sub_epi16(in06, in25); - step1[26] = _mm256_sub_epi16(in05, in26); - step1[27] = _mm256_sub_epi16(in04, in27); - } - { - __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); - __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); - __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); - __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); - __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); - __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); - __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); - __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); - step1[8] = _mm256_add_epi16(in08, in23); - step1[9] = _mm256_add_epi16(in09, in22); - step1[10] = _mm256_add_epi16(in10, in21); - step1[11] = _mm256_add_epi16(in11, in20); - step1[20] = _mm256_sub_epi16(in11, in20); - step1[21] = _mm256_sub_epi16(in10, in21); - step1[22] = _mm256_sub_epi16(in09, in22); - step1[23] = _mm256_sub_epi16(in08, in23); - } - { - __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); - __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); - __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); - __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); - __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); - __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); - __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); - __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); - step1[12] = _mm256_add_epi16(in12, in19); - step1[13] = _mm256_add_epi16(in13, in18); - step1[14] = _mm256_add_epi16(in14, in17); - step1[15] = _mm256_add_epi16(in15, in16); - step1[16] = _mm256_sub_epi16(in15, in16); - step1[17] = 
_mm256_sub_epi16(in14, in17); - step1[18] = _mm256_sub_epi16(in13, in18); - step1[19] = _mm256_sub_epi16(in12, in19); - } - } - // Stage 2 - { - step2[0] = _mm256_add_epi16(step1[0], step1[15]); - step2[1] = _mm256_add_epi16(step1[1], step1[14]); - step2[2] = _mm256_add_epi16(step1[2], step1[13]); - step2[3] = _mm256_add_epi16(step1[3], step1[12]); - step2[4] = _mm256_add_epi16(step1[4], step1[11]); - step2[5] = _mm256_add_epi16(step1[5], step1[10]); - step2[6] = _mm256_add_epi16(step1[6], step1[9]); - step2[7] = _mm256_add_epi16(step1[7], step1[8]); - step2[8] = _mm256_sub_epi16(step1[7], step1[8]); - step2[9] = _mm256_sub_epi16(step1[6], step1[9]); - step2[10] = _mm256_sub_epi16(step1[5], step1[10]); - step2[11] = _mm256_sub_epi16(step1[4], step1[11]); - step2[12] = _mm256_sub_epi16(step1[3], step1[12]); - step2[13] = _mm256_sub_epi16(step1[2], step1[13]); - step2[14] = _mm256_sub_epi16(step1[1], step1[14]); - step2[15] = _mm256_sub_epi16(step1[0], step1[15]); - } - { - const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); - const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); - const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); - const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); - const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); - const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); - const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); - const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); - const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s2_20_4 = - _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m256i s2_20_5 = - _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m256i s2_21_4 = - _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m256i s2_21_5 = - _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m256i s2_22_4 = - _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m256i s2_22_5 = - _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m256i s2_23_4 = - _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m256i s2_23_5 = - _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m256i s2_24_4 = - _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m256i s2_24_5 = - _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m256i s2_25_4 = - 
_mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m256i s2_25_5 = - _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m256i s2_26_4 = - _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m256i s2_26_5 = - _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m256i s2_27_4 = - _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m256i s2_27_5 = - _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); - } - -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. 
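The low-precision second pass that follows halves the magnitude twice (>> 2) with symmetric rounding so every intermediate value stays within 16 bits. _mm256_cmpgt_epi16(kZero, x) yields an all-ones mask (numerically -1) in negative lanes, so subtracting that mask adds 1 exactly where x < 0, and adding kOne supplies the usual rounding bias before the arithmetic shift. A scalar equivalent of the cmpgt/sub/add/srai sequence (the function name follows the scalar C reference path and is shown here only as a sketch):

    static int16_t half_round_shift(int16_t x) {
      /* (x < 0) contributes the extra 1 that the cmpgt/sub pair injects. */
      return (int16_t)((x + 1 + (x < 0)) >> 2);
    }

Only pass 1 needs this: pass 0 has already pre-scaled its inputs by << 2, so the extra headroom exists solely in the second pass.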
- if (1 == pass) { - __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); - __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); - __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); - __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); - __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); - __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); - __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); - __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); - __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); - __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); - __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); - __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); - __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); - __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); - __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); - __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); - __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); - __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); - __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); - __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); - __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]); - __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); - __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); - __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); - __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); - __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); - __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); - __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); - __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); - __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); - __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); - __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); - - step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); - step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); - step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); - step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); - step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); - step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); - step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); - step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); - step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); - step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); - step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); - step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); - step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); - step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); - step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); - step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); - step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); - step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); - step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); - step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); - step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); - step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); - step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); - step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); - step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); - step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); - step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); - step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); - step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); - step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); - step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); - step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); - - step2[0] = _mm256_add_epi16(step2[0], 
kOne); - step2[1] = _mm256_add_epi16(step2[1], kOne); - step2[2] = _mm256_add_epi16(step2[2], kOne); - step2[3] = _mm256_add_epi16(step2[3], kOne); - step2[4] = _mm256_add_epi16(step2[4], kOne); - step2[5] = _mm256_add_epi16(step2[5], kOne); - step2[6] = _mm256_add_epi16(step2[6], kOne); - step2[7] = _mm256_add_epi16(step2[7], kOne); - step2[8] = _mm256_add_epi16(step2[8], kOne); - step2[9] = _mm256_add_epi16(step2[9], kOne); - step2[10] = _mm256_add_epi16(step2[10], kOne); - step2[11] = _mm256_add_epi16(step2[11], kOne); - step2[12] = _mm256_add_epi16(step2[12], kOne); - step2[13] = _mm256_add_epi16(step2[13], kOne); - step2[14] = _mm256_add_epi16(step2[14], kOne); - step2[15] = _mm256_add_epi16(step2[15], kOne); - step1[16] = _mm256_add_epi16(step1[16], kOne); - step1[17] = _mm256_add_epi16(step1[17], kOne); - step1[18] = _mm256_add_epi16(step1[18], kOne); - step1[19] = _mm256_add_epi16(step1[19], kOne); - step2[20] = _mm256_add_epi16(step2[20], kOne); - step2[21] = _mm256_add_epi16(step2[21], kOne); - step2[22] = _mm256_add_epi16(step2[22], kOne); - step2[23] = _mm256_add_epi16(step2[23], kOne); - step2[24] = _mm256_add_epi16(step2[24], kOne); - step2[25] = _mm256_add_epi16(step2[25], kOne); - step2[26] = _mm256_add_epi16(step2[26], kOne); - step2[27] = _mm256_add_epi16(step2[27], kOne); - step1[28] = _mm256_add_epi16(step1[28], kOne); - step1[29] = _mm256_add_epi16(step1[29], kOne); - step1[30] = _mm256_add_epi16(step1[30], kOne); - step1[31] = _mm256_add_epi16(step1[31], kOne); - - step2[0] = _mm256_srai_epi16(step2[0], 2); - step2[1] = _mm256_srai_epi16(step2[1], 2); - step2[2] = _mm256_srai_epi16(step2[2], 2); - step2[3] = _mm256_srai_epi16(step2[3], 2); - step2[4] = _mm256_srai_epi16(step2[4], 2); - step2[5] = _mm256_srai_epi16(step2[5], 2); - step2[6] = _mm256_srai_epi16(step2[6], 2); - step2[7] = _mm256_srai_epi16(step2[7], 2); - step2[8] = _mm256_srai_epi16(step2[8], 2); - step2[9] = _mm256_srai_epi16(step2[9], 2); - step2[10] = _mm256_srai_epi16(step2[10], 2); - step2[11] = _mm256_srai_epi16(step2[11], 2); - step2[12] = _mm256_srai_epi16(step2[12], 2); - step2[13] = _mm256_srai_epi16(step2[13], 2); - step2[14] = _mm256_srai_epi16(step2[14], 2); - step2[15] = _mm256_srai_epi16(step2[15], 2); - step1[16] = _mm256_srai_epi16(step1[16], 2); - step1[17] = _mm256_srai_epi16(step1[17], 2); - step1[18] = _mm256_srai_epi16(step1[18], 2); - step1[19] = _mm256_srai_epi16(step1[19], 2); - step2[20] = _mm256_srai_epi16(step2[20], 2); - step2[21] = _mm256_srai_epi16(step2[21], 2); - step2[22] = _mm256_srai_epi16(step2[22], 2); - step2[23] = _mm256_srai_epi16(step2[23], 2); - step2[24] = _mm256_srai_epi16(step2[24], 2); - step2[25] = _mm256_srai_epi16(step2[25], 2); - step2[26] = _mm256_srai_epi16(step2[26], 2); - step2[27] = _mm256_srai_epi16(step2[27], 2); - step1[28] = _mm256_srai_epi16(step1[28], 2); - step1[29] = _mm256_srai_epi16(step1[29], 2); - step1[30] = _mm256_srai_epi16(step1[30], 2); - step1[31] = _mm256_srai_epi16(step1[31], 2); - } -#endif - -#if FDCT32x32_HIGH_PRECISION - if (pass == 0) { -#endif - // Stage 3 - { - step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const 
__m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = - _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = - _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = - _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = - _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = - _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = - _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = - _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = - _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm256_add_epi16(step2[23], step1[16]); - step3[17] = _mm256_add_epi16(step2[22], step1[17]); - step3[18] = _mm256_add_epi16(step2[21], step1[18]); - step3[19] = _mm256_add_epi16(step2[20], step1[19]); - step3[20] = _mm256_sub_epi16(step1[19], step2[20]); - step3[21] = _mm256_sub_epi16(step1[18], step2[21]); - step3[22] = _mm256_sub_epi16(step1[17], step2[22]); - step3[23] = _mm256_sub_epi16(step1[16], step2[23]); - step3[24] = _mm256_sub_epi16(step1[31], step2[24]); - step3[25] = _mm256_sub_epi16(step1[30], step2[25]); - step3[26] = _mm256_sub_epi16(step1[29], step2[26]); - step3[27] = _mm256_sub_epi16(step1[28], step2[27]); - step3[28] = _mm256_add_epi16(step2[27], step1[28]); - step3[29] = _mm256_add_epi16(step2[26], step1[29]); - step3[30] = _mm256_add_epi16(step2[25], step1[30]); - step3[31] = _mm256_add_epi16(step2[24], step1[31]); - } - - // Stage 4 - { - step1[0] = _mm256_add_epi16(step3[3], step3[0]); - step1[1] = _mm256_add_epi16(step3[2], step3[1]); - step1[2] = _mm256_sub_epi16(step3[1], step3[2]); - step1[3] = _mm256_sub_epi16(step3[0], step3[3]); - step1[8] = _mm256_add_epi16(step3[11], step2[8]); - step1[9] = _mm256_add_epi16(step3[10], step2[9]); - step1[10] = _mm256_sub_epi16(step2[9], step3[10]); - step1[11] = 
_mm256_sub_epi16(step2[8], step3[11]); - step1[12] = _mm256_sub_epi16(step2[15], step3[12]); - step1[13] = _mm256_sub_epi16(step2[14], step3[13]); - step1[14] = _mm256_add_epi16(step3[13], step2[14]); - step1[15] = _mm256_add_epi16(step3[12], step2[15]); - } - { - const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); - const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); - const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s1_05_4 = - _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m256i s1_05_5 = - _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m256i s1_06_4 = - _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m256i s1_06_5 = - _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); - const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); - const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); - const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); - const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); - const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); - const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); - const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); - const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s1_18_4 = - _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m256i s1_18_5 = - _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m256i s1_19_4 = - _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m256i s1_19_5 = - _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m256i s1_20_4 = - _mm256_add_epi32(s1_20_2, 
k__DCT_CONST_ROUNDING); - const __m256i s1_20_5 = - _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m256i s1_21_4 = - _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m256i s1_21_5 = - _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m256i s1_26_4 = - _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m256i s1_26_5 = - _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m256i s1_27_4 = - _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m256i s1_27_5 = - _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m256i s1_28_4 = - _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m256i s1_28_5 = - _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m256i s1_29_4 = - _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m256i s1_29_5 = - _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm256_add_epi16(step1[5], step3[4]); - step2[5] = _mm256_sub_epi16(step3[4], step1[5]); - step2[6] = _mm256_sub_epi16(step3[7], step1[6]); - step2[7] = _mm256_add_epi16(step1[6], step3[7]); - } - { - const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); - const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); - const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); - const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); - const __m256i out_00_2 = - _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m256i out_00_3 = - _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m256i out_16_2 = - _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m256i out_16_3 = - _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m256i out_08_2 = - _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m256i out_08_3 = - _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m256i out_24_2 = - _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m256i out_24_3 = - _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); - 
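Every block in this kernel follows the same unpack/madd/round/pack butterfly: _mm256_unpacklo_epi16(a, b) interleaves the inputs so each 32-bit group holds one (a_i, b_i) pair, and _mm256_madd_epi16 against a packed (c0, c1) cosine pair produces a_i*c0 + b_i*c1 in 32 bits. A scalar sketch of one lane, assuming the usual libaom fixed-point parameters (DCT_CONST_BITS == 14, rounding term 1 << 13) -- an illustration, not code from the patch:

#include <stdint.h>

#define DCT_CONST_BITS 14 /* assumed, as in libaom */
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* One lane of the unpack/madd/round/pack butterfly. */
static int16_t butterfly_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  const int32_t sum = (int32_t)a * c0 + (int32_t)b * c1; /* _mm256_madd_epi16 */
  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
/* _mm256_packs_epi32 then saturates two such 32-bit result vectors
 * back into a single 16-bit vector. */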
// dct_const_round_shift - const __m256i out_00_4 = - _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m256i out_00_5 = - _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m256i out_16_4 = - _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m256i out_16_5 = - _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m256i out_08_4 = - _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m256i out_08_5 = - _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m256i out_24_4 = - _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m256i out_24_5 = - _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm256_packs_epi32(out_00_6, out_00_7); - out[16] = _mm256_packs_epi32(out_16_6, out_16_7); - out[8] = _mm256_packs_epi32(out_08_6, out_08_7); - out[24] = _mm256_packs_epi32(out_24_6, out_24_7); - } - { - const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); - const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); - const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); - const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); - const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s2_09_4 = - _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m256i s2_09_5 = - _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m256i s2_10_4 = - _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m256i s2_10_5 = - _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m256i s2_13_4 = - _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m256i s2_13_5 = - _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m256i s2_14_4 = - _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m256i s2_14_5 = - _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - 
step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm256_add_epi16(step1[19], step3[16]); - step2[17] = _mm256_add_epi16(step1[18], step3[17]); - step2[18] = _mm256_sub_epi16(step3[17], step1[18]); - step2[19] = _mm256_sub_epi16(step3[16], step1[19]); - step2[20] = _mm256_sub_epi16(step3[23], step1[20]); - step2[21] = _mm256_sub_epi16(step3[22], step1[21]); - step2[22] = _mm256_add_epi16(step1[21], step3[22]); - step2[23] = _mm256_add_epi16(step1[20], step3[23]); - step2[24] = _mm256_add_epi16(step1[27], step3[24]); - step2[25] = _mm256_add_epi16(step1[26], step3[25]); - step2[26] = _mm256_sub_epi16(step3[25], step1[26]); - step2[27] = _mm256_sub_epi16(step3[24], step1[27]); - step2[28] = _mm256_sub_epi16(step3[31], step1[28]); - step2[29] = _mm256_sub_epi16(step3[30], step1[29]); - step2[30] = _mm256_add_epi16(step1[29], step3[30]); - step2[31] = _mm256_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_04_2 = - _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m256i out_04_3 = - _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m256i out_20_2 = - _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m256i out_20_3 = - _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m256i out_12_2 = - _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m256i out_12_3 = - _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m256i out_28_2 = - _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m256i out_28_3 = - _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m256i out_04_4 = - _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m256i out_04_5 = - _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m256i out_20_4 = - _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m256i out_20_5 = - _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m256i out_12_4 = - _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m256i out_12_5 = - _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m256i out_28_4 = - _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m256i out_28_5 = - _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = 
_mm256_packs_epi32(out_04_6, out_04_7); - out[20] = _mm256_packs_epi32(out_20_6, out_20_7); - out[12] = _mm256_packs_epi32(out_12_6, out_12_7); - out[28] = _mm256_packs_epi32(out_28_6, out_28_7); - } - { - step3[8] = _mm256_add_epi16(step2[9], step1[8]); - step3[9] = _mm256_sub_epi16(step1[8], step2[9]); - step3[10] = _mm256_sub_epi16(step1[11], step2[10]); - step3[11] = _mm256_add_epi16(step2[10], step1[11]); - step3[12] = _mm256_add_epi16(step2[13], step1[12]); - step3[13] = _mm256_sub_epi16(step1[12], step2[13]); - step3[14] = _mm256_sub_epi16(step1[15], step2[14]); - step3[15] = _mm256_add_epi16(step2[14], step1[15]); - } - { - const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); - const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); - const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); - const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); - const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); - const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); - const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); - const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); - const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m256i s3_17_4 = - _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m256i s3_17_5 = - _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m256i s3_18_4 = - _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m256i s3_18_5 = - _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m256i s3_21_4 = - _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m256i s3_21_5 = - _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m256i s3_22_4 = - _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m256i s3_22_5 = - _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, 
DCT_CONST_BITS); - const __m256i s3_25_4 = - _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m256i s3_25_5 = - _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m256i s3_26_4 = - _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m256i s3_26_5 = - _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m256i s3_29_4 = - _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m256i s3_29_5 = - _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m256i s3_30_4 = - _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m256i s3_30_5 = - _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); - const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); - const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); - const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); - const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); - const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); - const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); - const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); - const __m256i out_02_2 = - _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m256i out_02_3 = - _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m256i out_18_2 = - _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m256i out_18_3 = - _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m256i out_10_2 = - _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m256i out_10_3 = - _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m256i out_26_2 = - _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m256i out_26_3 = - _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m256i out_06_2 = - _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m256i out_06_3 = - _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m256i out_22_2 = - _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m256i out_22_3 = - _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m256i out_14_2 = - _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m256i out_14_3 = - _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m256i out_30_2 = - _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m256i out_30_3 = - _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m256i out_02_4 = - _mm256_add_epi32(out_02_2, 
k__DCT_CONST_ROUNDING); - const __m256i out_02_5 = - _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m256i out_18_4 = - _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m256i out_18_5 = - _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m256i out_10_4 = - _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m256i out_10_5 = - _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m256i out_26_4 = - _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m256i out_26_5 = - _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m256i out_06_4 = - _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m256i out_06_5 = - _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m256i out_22_4 = - _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m256i out_22_5 = - _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m256i out_14_4 = - _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m256i out_14_5 = - _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m256i out_30_4 = - _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m256i out_30_5 = - _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm256_packs_epi32(out_02_6, out_02_7); - out[18] = _mm256_packs_epi32(out_18_6, out_18_7); - out[10] = _mm256_packs_epi32(out_10_6, out_10_7); - out[26] = _mm256_packs_epi32(out_26_6, out_26_7); - out[6] = _mm256_packs_epi32(out_06_6, out_06_7); - out[22] = _mm256_packs_epi32(out_22_6, out_22_7); - out[14] = _mm256_packs_epi32(out_14_6, out_14_7); - out[30] = _mm256_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm256_add_epi16(step3[17], step2[16]); - step1[17] = _mm256_sub_epi16(step2[16], step3[17]); - step1[18] = _mm256_sub_epi16(step2[19], step3[18]); - step1[19] = _mm256_add_epi16(step3[18], step2[19]); - step1[20] = _mm256_add_epi16(step3[21], step2[20]); - step1[21] = _mm256_sub_epi16(step2[20], step3[21]); - step1[22] = _mm256_sub_epi16(step2[23], step3[22]); - step1[23] = _mm256_add_epi16(step3[22], step2[23]); - step1[24] = _mm256_add_epi16(step3[25], step2[24]); - step1[25] = _mm256_sub_epi16(step2[24], step3[25]); - step1[26] = _mm256_sub_epi16(step2[27], step3[26]); - step1[27] = _mm256_add_epi16(step3[26], step2[27]); - step1[28] = _mm256_add_epi16(step3[29], step2[28]); - step1[29] = _mm256_sub_epi16(step2[28], 
step3[29]); - step1[30] = _mm256_sub_epi16(step2[31], step3[30]); - step1[31] = _mm256_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); - const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); - const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); - const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); - const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); - const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); - const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); - const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); - const __m256i out_01_2 = - _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m256i out_01_3 = - _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m256i out_17_2 = - _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m256i out_17_3 = - _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m256i out_09_2 = - _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m256i out_09_3 = - _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m256i out_25_2 = - _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m256i out_25_3 = - _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m256i out_07_2 = - _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m256i out_07_3 = - _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m256i out_23_2 = - _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m256i out_23_3 = - _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m256i out_15_2 = - _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m256i out_15_3 = - _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m256i out_31_2 = - _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m256i out_31_3 = - _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m256i out_01_4 = - _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m256i out_01_5 = - _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m256i out_17_4 = - _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m256i out_17_5 = - _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m256i out_09_4 = - _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m256i out_09_5 = - _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m256i out_25_4 = - _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m256i out_25_5 = - _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m256i out_07_4 = - _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m256i out_07_5 = - _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m256i out_23_4 = - _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m256i out_23_5 = - _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m256i out_15_4 = - _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m256i out_15_5 = - _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m256i out_31_4 = - _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m256i out_31_5 = - _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, 
DCT_CONST_BITS); - const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm256_packs_epi32(out_01_6, out_01_7); - out[17] = _mm256_packs_epi32(out_17_6, out_17_7); - out[9] = _mm256_packs_epi32(out_09_6, out_09_7); - out[25] = _mm256_packs_epi32(out_25_6, out_25_7); - out[7] = _mm256_packs_epi32(out_07_6, out_07_7); - out[23] = _mm256_packs_epi32(out_23_6, out_23_7); - out[15] = _mm256_packs_epi32(out_15_6, out_15_7); - out[31] = _mm256_packs_epi32(out_31_6, out_31_7); - } - { - const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); - const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); - const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); - const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); - const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); - const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); - const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); - const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); - const __m256i out_05_2 = - _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m256i out_05_3 = - _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m256i out_21_2 = - _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m256i out_21_3 = - _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m256i out_13_2 = - _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m256i out_13_3 = - _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m256i out_29_2 = - _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m256i out_29_3 = - _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m256i out_03_2 = - _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m256i out_03_3 = - _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m256i out_19_2 = - _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m256i out_19_3 = - _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m256i out_11_2 = - _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m256i out_11_3 = - _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m256i out_27_2 = - _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m256i out_27_3 = - _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m256i out_05_4 = - _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m256i out_05_5 = - _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m256i out_21_4 = - _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m256i out_21_5 = - _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m256i out_13_4 = - _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m256i 
out_13_5 = - _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m256i out_29_4 = - _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m256i out_29_5 = - _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m256i out_03_4 = - _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m256i out_03_5 = - _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m256i out_19_4 = - _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m256i out_19_5 = - _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m256i out_11_4 = - _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m256i out_11_5 = - _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m256i out_27_4 = - _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m256i out_27_5 = - _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm256_packs_epi32(out_05_6, out_05_7); - out[21] = _mm256_packs_epi32(out_21_6, out_21_7); - out[13] = _mm256_packs_epi32(out_13_6, out_13_7); - out[29] = _mm256_packs_epi32(out_29_6, out_29_7); - out[3] = _mm256_packs_epi32(out_03_6, out_03_7); - out[19] = _mm256_packs_epi32(out_19_6, out_19_7); - out[11] = _mm256_packs_epi32(out_11_6, out_11_7); - out[27] = _mm256_packs_epi32(out_27_6, out_27_7); - } -#if FDCT32x32_HIGH_PRECISION - } else { - __m256i lstep1[64], lstep2[64], lstep3[64]; - __m256i u[32], v[32], sign[16]; - const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); - // start using 32-bit operations - // stage 3 - { - // expanding to 32-bit length priori to addition operations - lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); - lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); - lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); - lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); - lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); - lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); - lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); - lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); - lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); - lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); - lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); - lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); - lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); - lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); - lstep2[14] = 
_mm256_unpacklo_epi16(step2[7], kZero); - lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); - lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); - lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); - lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); - lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); - lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); - lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); - lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); - lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); - lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); - lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); - lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); - lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); - lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); - lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne); - lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); - lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); - - lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); - lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); - lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); - lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); - lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); - lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); - lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); - lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]); - lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); - lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); - lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); - lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); - lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); - lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); - lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); - lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); - } - { - const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = - _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = - _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = - _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = - _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = - _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = - _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = - _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = - _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - 
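Note the widening idiom used at the start of this 32-bit branch: instead of _mm256_cvtepi16_epi32, it interleaves each 16-bit value with kZero to form (v, 0) pairs and then runs _mm256_madd_epi16 against a vector of ones, which computes v*1 + 0*1 with signed multiplies, i.e. a sign extension that preserves the in-lane ordering the later unpack-based stages expect. A minimal sketch of the idiom (the helper name is illustrative, not from the patch):

#include <immintrin.h>

/* Sign-extend the 16 values in v into two vectors of eight 32-bit
 * lanes (low half, high half) using the unpack+madd idiom above. */
static void widen16to32(__m256i v, __m256i *lo, __m256i *hi) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i one = _mm256_set1_epi16(1);
  *lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(v, zero), one);
  *hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(v, zero), one);
}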
lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - } - { - lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); - lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); - lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); - lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); - lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); - lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); - lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); - lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); - lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero); - lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero); - lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero); - lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero); - lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero); - lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero); - lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero); - lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero); - lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne); - lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne); - lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne); - lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne); - lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne); - lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne); - lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne); - lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne); - lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne); - lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne); - lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne); - lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne); - lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne); - lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne); - lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne); - lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne); - - lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); - lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); - lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); - lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); - lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); - lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); - lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); - lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); - lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero); - lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero); - lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero); - lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero); - lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero); - lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero); - lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero); - lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero); - lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne); - lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne); - lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne); - lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne); - lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne); - lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne); - lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne); - lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne); - lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne); - lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne); - lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne); - lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne); - lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne); - 
lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne); - lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne); - lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne); - - lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); - lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]); - - lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); - lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); - lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); - lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); - lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); - lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); - lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); - lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); - lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]); - lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); - lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); - lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); - lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); - lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); - lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); - lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); - lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); - lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]); - lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); - lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); - lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); - lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); - lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); - lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); - lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); - lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); - lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]); - lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]); - lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]); - lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]); - } - - // stage 4 - { - // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); - lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); - lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); - lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); - lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); - lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); - lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); - lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); - lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); - lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); - lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); - lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); - lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne); - lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne); - lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); - lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); - - lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); - lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); - lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); - lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); - lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); - lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); - lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); - lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); - lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); - lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); - lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); - lstep1[19] = 
_mm256_add_epi32(lstep3[21], lstep2[19]); - lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); - lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); - lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]); - lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); - lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); - lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); - lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); - lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); - lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); - lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); - lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); - lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); - } - { - // to be continued... - // - const __m256i k32_p16_p16 = - pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = - pair256_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - } - { - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = - pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); - u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); - u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); - u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); - u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); - u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); - u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); - u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); - u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); - u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); - u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); - u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); - u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); - u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); - u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); - u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); - v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); - v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); - v[3] = 
k_madd_epi32_avx2(u[3], k32_m08_p24); - v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); - v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); - v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); - v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); - v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); - v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); - v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); - v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); - v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); - v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); - v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); - v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); - v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); - v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); - v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); - v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); - v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); - v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); - v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); - v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); - v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); - v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); - v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); - v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - 
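With 32-bit data the butterflies can no longer use _mm256_madd_epi16, so this branch pairs values with _mm256_unpacklo_epi32/_mm256_unpackhi_epi32, forms 64-bit products through k_madd_epi32_avx2, and narrows them back with k_packs_epi64_avx2 before the usual rounding shift. Functionally, each lane computes the following (a sketch paraphrased from how the helpers are used here, not their actual bodies; DCT_CONST_BITS == 14 assumed, and the sum is taken to fit in 32 bits as the transform arranges by construction):

#include <stdint.h>

/* Per-lane effect of the unpack32 / k_madd_epi32_avx2 /
 * k_packs_epi64_avx2 / round sequence: a 32-bit rotation by a
 * pair256_set_epi32 cosine pair (c0, c1), accumulated in 64 bits
 * and rounded back down to 32. */
static int32_t rotate_lane32(int32_t x, int32_t y, int32_t c0, int32_t c1) {
  const int64_t sum = (int64_t)x * c0 + (int64_t)y * c1;
  return (int32_t)((sum + (1 << 13)) >> 14); /* k__DCT_CONST_ROUNDING, DCT_CONST_BITS */
}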
lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 5 - { - lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); - lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); - lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); - lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); - lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); - lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); - lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); - lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); - } - { - const __m256i k32_p16_p16 = - pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = - pair256_set_epi32(cospi_16_64, -cospi_16_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); - u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); - u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); - u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); - u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); - u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); - u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); - u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); - v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); - v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); - v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); - v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], 
DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); - - u[0] = _mm256_sub_epi32(u[0], sign[0]); - u[1] = _mm256_sub_epi32(u[1], sign[1]); - u[2] = _mm256_sub_epi32(u[2], sign[2]); - u[3] = _mm256_sub_epi32(u[3], sign[3]); - u[4] = _mm256_sub_epi32(u[4], sign[4]); - u[5] = _mm256_sub_epi32(u[5], sign[5]); - u[6] = _mm256_sub_epi32(u[6], sign[6]); - u[7] = _mm256_sub_epi32(u[7], sign[7]); - - u[0] = _mm256_add_epi32(u[0], K32One); - u[1] = _mm256_add_epi32(u[1], K32One); - u[2] = _mm256_add_epi32(u[2], K32One); - u[3] = _mm256_add_epi32(u[3], K32One); - u[4] = _mm256_add_epi32(u[4], K32One); - u[5] = _mm256_add_epi32(u[5], K32One); - u[6] = _mm256_add_epi32(u[6], K32One); - u[7] = _mm256_add_epi32(u[7], K32One); - - u[0] = _mm256_srai_epi32(u[0], 2); - u[1] = _mm256_srai_epi32(u[1], 2); - u[2] = _mm256_srai_epi32(u[2], 2); - u[3] = _mm256_srai_epi32(u[3], 2); - u[4] = _mm256_srai_epi32(u[4], 2); - u[5] = _mm256_srai_epi32(u[5], 2); - u[6] = _mm256_srai_epi32(u[6], 2); - u[7] = _mm256_srai_epi32(u[7], 2); - - // Combine - out[0] = _mm256_packs_epi32(u[0], u[1]); - out[16] = _mm256_packs_epi32(u[2], u[3]); - out[8] = _mm256_packs_epi32(u[4], u[5]); - out[24] = _mm256_packs_epi32(u[6], u[7]); - } - { - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = - pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); - u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); - u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); - u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); - u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]); - u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); - u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); - u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); - v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); - v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); - v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); - v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); - v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); - v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); - v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); - v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); - v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); - v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); - v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); - v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = 
_mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); - lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); - lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); - lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); - lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS); - lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); - lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); - lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); - } - { - lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); - lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); - lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); - lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); - lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); - lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); - lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); - lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); - lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); - lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); - lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); - lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); - lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); - lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); - lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); - lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); - lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); - lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); - lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); - lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); - lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); - lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); - lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); - lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); - lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); - lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); - lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); - lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); - lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); - lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); - lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); - lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]); - } - // stage 6 - { - const __m256i k32_p28_p04 = - pair256_set_epi32(cospi_28_64, cospi_4_64); - const __m256i k32_p12_p20 = - pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_m20_p12 = - pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m04_p28 = - pair256_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); - u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); - u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); - u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); - u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); - u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); - u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); - u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); - u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); - u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); - u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); - u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); 
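// Note: the constant-free blocks in each stage, such as
//   lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
//   lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
// just above, are plain radix-2 butterflies: one output takes the sum of
// a pair, its mirror takes the difference, vectorized over eight 32-bit
// lanes.  Scalar model (hypothetical helper name):
//
//   static void butterfly_add_sub(int32_t a, int32_t b,
//                                 int32_t *sum, int32_t *diff) {
//     *sum = a + b;
//     *diff = b - a;  /* operand order matches the _mm256_sub lines */
//   }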
- u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); - u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); - u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); - u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); - v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); - v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); - v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); - v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); - v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); - v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); - v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); - v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); - v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); - v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); - v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); - v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); - v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); - v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); - v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); - - u[0] = _mm256_sub_epi32(u[0], sign[0]); - u[1] = _mm256_sub_epi32(u[1], sign[1]); - u[2] = _mm256_sub_epi32(u[2], sign[2]); - u[3] = _mm256_sub_epi32(u[3], sign[3]); - u[4] = _mm256_sub_epi32(u[4], sign[4]); - u[5] = _mm256_sub_epi32(u[5], sign[5]); - u[6] = _mm256_sub_epi32(u[6], sign[6]); - u[7] = _mm256_sub_epi32(u[7], sign[7]); - - u[0] = _mm256_add_epi32(u[0], K32One); - u[1] = _mm256_add_epi32(u[1], K32One); - u[2] = _mm256_add_epi32(u[2], K32One); - u[3] = _mm256_add_epi32(u[3], K32One); - u[4] = _mm256_add_epi32(u[4], K32One); - u[5] = _mm256_add_epi32(u[5], K32One); - u[6] = _mm256_add_epi32(u[6], K32One); - u[7] = _mm256_add_epi32(u[7], K32One); - - u[0] = _mm256_srai_epi32(u[0], 2); - u[1] = _mm256_srai_epi32(u[1], 2); - u[2] = _mm256_srai_epi32(u[2], 2); - u[3] = _mm256_srai_epi32(u[3], 2); - u[4] = _mm256_srai_epi32(u[4], 2); - u[5] = _mm256_srai_epi32(u[5], 2); - u[6] = _mm256_srai_epi32(u[6], 2); - u[7] = _mm256_srai_epi32(u[7], 2); - - out[4] = _mm256_packs_epi32(u[0], u[1]); - 
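// Note: the cmpgt/sub/add(K32One)/srai(2) tail feeding out[4] above (and
// out[20]/out[12]/out[28] just below) rounds each 32-bit lane with
//   (x + 1 + (x < 0)) >> 2
// which is the same formula the scalar aom_fdct32x32_rows_c reference
// (shown later in this diff) applies, before _mm256_packs_epi32
// saturates the result back to 16 bits.  Scalar sketch (hypothetical
// helper name; INT16_MAX/INT16_MIN from <stdint.h>):
//
//   static int16_t round_shift_pack(int32_t x) {
//     x += (x < 0);                      /* subtracting the cmpgt mask */
//     x += 1;                            /* K32One */
//     x >>= 2;                           /* remove the extra scaling bits */
//     if (x > INT16_MAX) x = INT16_MAX;  /* _mm256_packs_epi32 saturates */
//     if (x < INT16_MIN) x = INT16_MIN;
//     return (int16_t)x;
//   }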
out[20] = _mm256_packs_epi32(u[2], u[3]); - out[12] = _mm256_packs_epi32(u[4], u[5]); - out[28] = _mm256_packs_epi32(u[6], u[7]); - } - { - lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); - lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); - lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); - lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); - lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); - lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); - lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); - lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); - lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); - lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); - lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); - lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); - lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); - lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); - lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); - lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); - } - { - const __m256i k32_m04_p28 = - pair256_set_epi32(-cospi_4_64, cospi_28_64); - const __m256i k32_m28_m04 = - pair256_set_epi32(-cospi_28_64, -cospi_4_64); - const __m256i k32_m20_p12 = - pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m12_m20 = - pair256_set_epi32(-cospi_12_64, -cospi_20_64); - const __m256i k32_p12_p20 = - pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_p28_p04 = - pair256_set_epi32(cospi_28_64, cospi_4_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); - u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); - u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); - u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); - u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); - u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); - u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); - u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); - u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); - u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); - u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); - u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); - u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); - u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); - u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); - u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); - v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); - v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); - v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); - v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); - v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); - v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); - v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); - v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); - v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); - v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); - v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); - v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); - v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); - v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); - v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); - v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); - v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); - v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); - v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); - v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); - v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); - v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); - v[23] = 
k_madd_epi32_avx2(u[11], k32_p12_p20); - v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); - v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); - v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); - v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); - v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); - v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); - v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); - v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 7 - { - const __m256i k32_p30_p02 = - pair256_set_epi32(cospi_30_64, cospi_2_64); - const __m256i k32_p14_p18 = - pair256_set_epi32(cospi_14_64, cospi_18_64); - const __m256i k32_p22_p10 = - pair256_set_epi32(cospi_22_64, cospi_10_64); - const __m256i k32_p06_p26 = - pair256_set_epi32(cospi_6_64, cospi_26_64); - const __m256i k32_m26_p06 = - pair256_set_epi32(-cospi_26_64, cospi_6_64); - const __m256i k32_m10_p22 = - pair256_set_epi32(-cospi_10_64, cospi_22_64); - const __m256i k32_m18_p14 = - pair256_set_epi32(-cospi_18_64, cospi_14_64); - const __m256i k32_m02_p30 = - pair256_set_epi32(-cospi_2_64, 
cospi_30_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); - u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); - u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); - u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); - u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); - u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); - u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); - u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); - u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); - u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); - u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); - u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); - u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); - u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); - u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); - u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); - v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); - v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); - v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); - v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); - v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); - v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); - v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); - v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); - v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); - v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); - v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); - v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); - v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); - v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); - v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); - v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); - v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); - v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); - v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); - v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); - v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); - v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); - v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); - v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); - v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); - v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); - v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); - v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); - v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); - v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); - v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = 
_mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = 
_mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = _mm256_srai_epi32(v[15], 2); - - out[2] = _mm256_packs_epi32(u[0], u[1]); - out[18] = _mm256_packs_epi32(u[2], u[3]); - out[10] = _mm256_packs_epi32(u[4], u[5]); - out[26] = _mm256_packs_epi32(u[6], u[7]); - out[6] = _mm256_packs_epi32(u[8], u[9]); - out[22] = _mm256_packs_epi32(u[10], u[11]); - out[14] = _mm256_packs_epi32(u[12], u[13]); - out[30] = _mm256_packs_epi32(u[14], u[15]); - } - { - lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); - lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); - lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); - lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); - lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); - lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); - lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]); - lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); - lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); - lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); - lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); - lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); - lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); - lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); - lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); - lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); - lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); - lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); - lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); - lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); - lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); - lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); - lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); - lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); - lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); - lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); - lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); - lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); - lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); - lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); - lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); - lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); - } - // stage 8 - { - const __m256i k32_p31_p01 = - pair256_set_epi32(cospi_31_64, cospi_1_64); - const __m256i k32_p15_p17 = - pair256_set_epi32(cospi_15_64, cospi_17_64); - const __m256i k32_p23_p09 = - pair256_set_epi32(cospi_23_64, cospi_9_64); - const __m256i k32_p07_p25 = - pair256_set_epi32(cospi_7_64, cospi_25_64); - const __m256i k32_m25_p07 = - pair256_set_epi32(-cospi_25_64, cospi_7_64); - const __m256i k32_m09_p23 = - pair256_set_epi32(-cospi_9_64, cospi_23_64); - const __m256i k32_m17_p15 = - pair256_set_epi32(-cospi_17_64, cospi_15_64); - const __m256i k32_m01_p31 = - pair256_set_epi32(-cospi_1_64, cospi_31_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); - u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); - u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); - u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); - u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); - u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); - u[6] = 
_mm256_unpacklo_epi32(lstep1[35], lstep1[61]); - u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); - u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); - u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); - u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); - u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); - u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); - u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); - u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); - u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); - v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); - v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); - v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); - v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); - v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); - v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); - v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); - v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); - v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); - v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); - v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); - v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); - v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); - v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); - v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); - v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); - v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); - v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); - v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); - v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); - v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); - v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); - v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); - v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); - v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); - v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); - v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); - v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); - v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); - v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); - v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], 
k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = _mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = 
_mm256_srai_epi32(v[15], 2); - - out[1] = _mm256_packs_epi32(u[0], u[1]); - out[17] = _mm256_packs_epi32(u[2], u[3]); - out[9] = _mm256_packs_epi32(u[4], u[5]); - out[25] = _mm256_packs_epi32(u[6], u[7]); - out[7] = _mm256_packs_epi32(u[8], u[9]); - out[23] = _mm256_packs_epi32(u[10], u[11]); - out[15] = _mm256_packs_epi32(u[12], u[13]); - out[31] = _mm256_packs_epi32(u[14], u[15]); - } - { - const __m256i k32_p27_p05 = - pair256_set_epi32(cospi_27_64, cospi_5_64); - const __m256i k32_p11_p21 = - pair256_set_epi32(cospi_11_64, cospi_21_64); - const __m256i k32_p19_p13 = - pair256_set_epi32(cospi_19_64, cospi_13_64); - const __m256i k32_p03_p29 = - pair256_set_epi32(cospi_3_64, cospi_29_64); - const __m256i k32_m29_p03 = - pair256_set_epi32(-cospi_29_64, cospi_3_64); - const __m256i k32_m13_p19 = - pair256_set_epi32(-cospi_13_64, cospi_19_64); - const __m256i k32_m21_p11 = - pair256_set_epi32(-cospi_21_64, cospi_11_64); - const __m256i k32_m05_p27 = - pair256_set_epi32(-cospi_5_64, cospi_27_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); - u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); - u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); - u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); - u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); - u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); - u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); - u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); - u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); - u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); - u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); - u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); - u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); - u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); - u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); - u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); - v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); - v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); - v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); - v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); - v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); - v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); - v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); - v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); - v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); - v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); - v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); - v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); - v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); - v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); - v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); - v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); - v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); - v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); - v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); - v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); - v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); - v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); - v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); - v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); - v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); - v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); - v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); - v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); - v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); - v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); - v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = 
k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], 
v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = _mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = _mm256_srai_epi32(v[15], 2); - - out[5] = _mm256_packs_epi32(u[0], u[1]); - out[21] = _mm256_packs_epi32(u[2], u[3]); - out[13] = _mm256_packs_epi32(u[4], u[5]); - out[29] = _mm256_packs_epi32(u[6], u[7]); - out[3] = _mm256_packs_epi32(u[8], u[9]); - out[19] = _mm256_packs_epi32(u[10], u[11]); - out[11] = _mm256_packs_epi32(u[12], u[13]); - out[27] = _mm256_packs_epi32(u[14], u[15]); - } - } -#endif - // Transpose the results, do it as four 8x8 transposes. 
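// Note: the scattered out[] indices are the coefficients' frequency
// positions in the 32-point transform, and they follow the stage that
// produced them:
//   stage 5 outputs land at out[8k]      -> 0, 8, 16, 24
//   stage 6 outputs land at out[8k + 4]  -> 4, 12, 20, 28
//   stage 7 outputs land at out[4k + 2]  -> 2, 6, 10, ..., 30
//   stage 8 outputs land at out[2k + 1]  -> all odd bins
// so after the 8x8 transposes below, row i already holds bin i.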
- { - int transpose_block; - int16_t *output_currStep, *output_nextStep; - tran_low_t *curr_out, *next_out; - // Pass 0 - output_currStep = &intermediate[column_start * 32]; - output_nextStep = &intermediate[(column_start + 8) * 32]; - // Pass 1 - curr_out = &output_org[column_start * 32]; - next_out = &output_org[(column_start + 8) * 32]; - - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m256i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 - // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 - // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 - // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 - // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 - // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 - const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]); - const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]); - const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]); - const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]); - const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]); - const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]); - const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]); - const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]); - // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31 - // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71 - // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35 - // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75 - // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 101 - // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151 - // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115 - // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155 - - const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); - const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); - const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); - const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); - const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); - const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); - const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); - const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); - // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69 - // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73 - // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71 - // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75 - // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149 - // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153 - // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151 - // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155 - __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148 - // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149 - // 02 22 42 62 82 102 122 142 10 30 50 70 
90 110 130 150 - // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151 - // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152 - // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153 - // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154 - // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ... - __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero); - __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero); - __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero); - __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero); - __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero); - __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero); - __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero); - __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in av1/encoder/av1_dct.c - tr2_0 = _mm256_add_epi16(tr2_0, kOne); - tr2_1 = _mm256_add_epi16(tr2_1, kOne); - tr2_2 = _mm256_add_epi16(tr2_2, kOne); - tr2_3 = _mm256_add_epi16(tr2_3, kOne); - tr2_4 = _mm256_add_epi16(tr2_4, kOne); - tr2_5 = _mm256_add_epi16(tr2_5, kOne); - tr2_6 = _mm256_add_epi16(tr2_6, kOne); - tr2_7 = _mm256_add_epi16(tr2_7, kOne); - tr2_0 = _mm256_srai_epi16(tr2_0, 2); - tr2_1 = _mm256_srai_epi16(tr2_1, 2); - tr2_2 = _mm256_srai_epi16(tr2_2, 2); - tr2_3 = _mm256_srai_epi16(tr2_3, 2); - tr2_4 = _mm256_srai_epi16(tr2_4, 2); - tr2_5 = _mm256_srai_epi16(tr2_5, 2); - tr2_6 = _mm256_srai_epi16(tr2_6, 2); - tr2_7 = _mm256_srai_epi16(tr2_7, 2); - } - if (0 == pass) { - // Note: even though all these stores are aligned, using the aligned - // intrinsic make the code slightly slower. 
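// Note: AVX2 unpacks interleave within each 128-bit lane, so every tr2_*
// register above holds one transposed row of *two* neighbouring 8x8
// blocks: the low half belongs to the block at column_start, the high
// half to the block at column_start + 8 (visible in the comment pairs
// such as "00 20 40 60 80 100 120 140 | 08 28 48 68 ...").  That is why
// the pass-0 stores below peel each register apart:
//   _mm_storeu_si128(curr, _mm256_castsi256_si128(tr2_0));       /* low lane */
//   _mm_storeu_si128(next, _mm256_extractf128_si256(tr2_0, 1));  /* high lane */
// with curr/next standing for output_currStep/output_nextStep.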
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), - _mm256_castsi256_si128(tr2_0)); - _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), - _mm256_castsi256_si128(tr2_1)); - _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), - _mm256_castsi256_si128(tr2_2)); - _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), - _mm256_castsi256_si128(tr2_3)); - _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), - _mm256_castsi256_si128(tr2_4)); - _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), - _mm256_castsi256_si128(tr2_5)); - _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), - _mm256_castsi256_si128(tr2_6)); - _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), - _mm256_castsi256_si128(tr2_7)); - - _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), - _mm256_extractf128_si256(tr2_0, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), - _mm256_extractf128_si256(tr2_1, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), - _mm256_extractf128_si256(tr2_2, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), - _mm256_extractf128_si256(tr2_3, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), - _mm256_extractf128_si256(tr2_4, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), - _mm256_extractf128_si256(tr2_5, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), - _mm256_extractf128_si256(tr2_6, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), - _mm256_extractf128_si256(tr2_7, 1)); - // Process next 8x8 - output_currStep += 8; - output_nextStep += 8; - } - if (1 == pass) { - store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32); - store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32); - store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32); - store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32); - store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32); - store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32); - store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32); - store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32); - curr_out += 8; - next_out += 8; - } - } - } - } - } - _mm256_zeroupper(); -} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h deleted file mode 100644 index 69dd6af11..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h +++ /dev/null @@ -1,3201 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "aom_dsp/fwd_txfm.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// TODO(jingning) The high bit-depth version needs re-work for performance. -// The current SSE2 implementation also causes cross reference to the static -// functions in the C implementation file. 
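// Note: the "cross reference" this TODO mentions is the overflow
// fallback wired up below: DCT_HIGH_BIT_DEPTH builds map
// ADD_EPI16/SUB_EPI16 to the saturating _mm_adds_epi16/_mm_subs_epi16 so
// that intermediate overflow stays detectable, and when a check fires
// the row pass is redone with the scalar HIGH_FDCT32x32_2D_ROWS_C
// function defined next.  A sketch of the presumable pattern (the
// `int overflow;` declared further down supports it; the exact
// check_epi16_overflow_* helper names come from the companion SSE2
// forward-transform header and are assumptions here):
//
//   step1[0] = ADD_EPI16(in00, in31);
//   /* ... */
//   overflow = check_epi16_overflow_x8(&step1[0], /* ... */ &step1[7]);
//   if (overflow) {
//     HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
//     return;
//   }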
-#if DCT_HIGH_BIT_DEPTH -#define ADD_EPI16 _mm_adds_epi16 -#define SUB_EPI16 _mm_subs_epi16 -#if FDCT32x32_HIGH_PRECISION -void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - aom_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} -#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c -#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c -#else -void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - aom_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} -#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c -#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c -#endif // FDCT32x32_HIGH_PRECISION -#else -#define ADD_EPI16 _mm_add_epi16 -#define SUB_EPI16 _mm_sub_epi16 -#endif // DCT_HIGH_BIT_DEPTH - -void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { - // Calculate pre-multiplied strides - const int str1 = stride; - const int str2 = 2 * stride; - const int str3 = 2 * stride + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i 
k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - int pass; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 8) { - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic make the code slightly slower. - if (0 == pass) { - const int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. 
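All of the pair_set_epi16 constants above exist to feed _mm_madd_epi16: packing two 16-bit cosines into each 32-bit lane lets a single instruction evaluate a butterfly term b*k0 + a*k1 per lane, and dct_const_round_shift is then the add/shift pair seen throughout the stages below. A self-contained sketch of one such rotation (helper name hypothetical; DCT_CONST_BITS is 14 in aom_dsp/txfm_common.h):

#include <emmintrin.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* For each of the eight output lanes: (b*k0 + a*k1 + ROUNDING) >> BITS,
 * where k_pair = pair_set_epi16(k0, k1). This is the unpack / madd /
 * add / srai / packs sequence the stage code below repeats. */
static __m128i butterfly_rotate(__m128i a, __m128i b, __m128i k_pair) {
  const __m128i round = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i lo = _mm_unpacklo_epi16(b, a); /* lanes: (b0,a0),(b1,a1).. */
  const __m128i hi = _mm_unpackhi_epi16(b, a);
  const __m128i r_lo = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(lo, k_pair), round), DCT_CONST_BITS);
  const __m128i r_hi = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(hi, k_pair), round), DCT_CONST_BITS);
  return _mm_packs_epi32(r_lo, r_hi); /* narrow with signed saturation */
}

In this notation the Stage 2 computation of step2[20] below is butterfly_rotate(step1[20], step1[27], k__cospi_p16_m16), i.e. (step1[27] - step1[20]) * cospi_16_64 followed by the rounding shift.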
- { - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = 
_mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. 
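In the DCT_HIGH_BIT_DEPTH build, ADD_EPI16/SUB_EPI16 are the saturating _mm_adds_epi16/_mm_subs_epi16, so a lane pinned at INT16_MAX or INT16_MIN marks a sum that did not fit in 16 bits; each stage below tests for that with check_epi16_overflow_x* and, on a hit, abandons the SIMD path and reruns the pass in C (HIGH_FDCT32x32_2D_ROWS_C or HIGH_FDCT32x32_2D_C). A sketch of such a test, as an illustrative reimplementation rather than the actual helper from the tree:

#include <emmintrin.h>
#include <stdint.h>

/* Nonzero when any 16-bit lane sits at a saturation bound, i.e. when a
 * preceding _mm_adds_epi16/_mm_subs_epi16 may have clipped its result. */
static int any_epi16_saturated(__m128i v) {
  const __m128i kmax = _mm_set1_epi16(INT16_MAX);
  const __m128i kmin = _mm_set1_epi16(INT16_MIN);
  const __m128i hit = _mm_or_si128(_mm_cmpeq_epi16(v, kmax),
                                   _mm_cmpeq_epi16(v, kmin));
  return _mm_movemask_epi8(hit); /* caller falls back to the C rows */
}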
- { - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); - step1[0] = ADD_EPI16(in00, in31); - step1[1] = ADD_EPI16(in01, in30); - step1[2] = ADD_EPI16(in02, in29); - step1[3] = ADD_EPI16(in03, in28); - step1[28] = SUB_EPI16(in03, in28); - step1[29] = SUB_EPI16(in02, in29); - step1[30] = SUB_EPI16(in01, in30); - step1[31] = SUB_EPI16(in00, in31); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], - &step1[3], &step1[28], &step1[29], - &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); - step1[4] = ADD_EPI16(in04, in27); - step1[5] = ADD_EPI16(in05, in26); - step1[6] = ADD_EPI16(in06, in25); - step1[7] = ADD_EPI16(in07, in24); - step1[24] = SUB_EPI16(in07, in24); - step1[25] = SUB_EPI16(in06, in25); - step1[26] = SUB_EPI16(in05, in26); - step1[27] = SUB_EPI16(in04, in27); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], - &step1[7], &step1[24], &step1[25], - &step1[26], &step1[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); - step1[8] = ADD_EPI16(in08, in23); - step1[9] = ADD_EPI16(in09, in22); - step1[10] = ADD_EPI16(in10, in21); - step1[11] = ADD_EPI16(in11, in20); - step1[20] = SUB_EPI16(in11, in20); - step1[21] = SUB_EPI16(in10, in21); - step1[22] = SUB_EPI16(in09, in22); - step1[23] = SUB_EPI16(in08, in23); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], - &step1[11], &step1[20], &step1[21], - &step1[22], &step1[23]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = 
_mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); - step1[12] = ADD_EPI16(in12, in19); - step1[13] = ADD_EPI16(in13, in18); - step1[14] = ADD_EPI16(in14, in17); - step1[15] = ADD_EPI16(in15, in16); - step1[16] = SUB_EPI16(in15, in16); - step1[17] = SUB_EPI16(in14, in17); - step1[18] = SUB_EPI16(in13, in18); - step1[19] = SUB_EPI16(in12, in19); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], - &step1[15], &step1[16], &step1[17], - &step1[18], &step1[19]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Stage 2 - { - step2[0] = ADD_EPI16(step1[0], step1[15]); - step2[1] = ADD_EPI16(step1[1], step1[14]); - step2[2] = ADD_EPI16(step1[2], step1[13]); - step2[3] = ADD_EPI16(step1[3], step1[12]); - step2[4] = ADD_EPI16(step1[4], step1[11]); - step2[5] = ADD_EPI16(step1[5], step1[10]); - step2[6] = ADD_EPI16(step1[6], step1[9]); - step2[7] = ADD_EPI16(step1[7], step1[8]); - step2[8] = SUB_EPI16(step1[7], step1[8]); - step2[9] = SUB_EPI16(step1[6], step1[9]); - step2[10] = SUB_EPI16(step1[5], step1[10]); - step2[11] = SUB_EPI16(step1[4], step1[11]); - step2[12] = SUB_EPI16(step1[3], step1[12]); - step2[13] = SUB_EPI16(step1[2], step1[13]); - step2[14] = SUB_EPI16(step1[1], step1[14]); - step2[15] = SUB_EPI16(step1[0], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 
= _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], - &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. 
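The comment above is about rescaling: in the non-HIGH_PRECISION (rd) build, the block that follows replaces each value by (x + 1 + (x < 0)) >> 2, i.e. x/4 rounded to nearest with ties toward zero, mapping [-32768, 32767] into [-8192, 8192] and freeing two bits of headroom for the remaining 16-bit adds; the HIGH_PRECISION build skips this and keeps the extra accuracy instead. A scalar sketch of the cmplt/sub/add/srai sequence below, with a couple of spot checks (illustrative, not from the tree):

#include <assert.h>
#include <stdint.h>

/* Scalar form of the SIMD rescale below: _mm_cmplt_epi16 yields -1 where
 * x < 0, SUB_EPI16 of that mask adds the bias, then +1 and an arithmetic
 * shift right by 2. */
static int16_t rescale_rd(int16_t x) {
  return (int16_t)((x + 1 + (x < 0)) >> 2);
}

int main(void) {
  assert(rescale_rd(INT16_MAX) == 8192);   /* full 16-bit range shrinks */
  assert(rescale_rd(INT16_MIN) == -8192);  /* to roughly 14 bits        */
  assert(rescale_rd(6) == 1 && rescale_rd(-6) == -1); /* ties toward 0 */
  return 0;
}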
- if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); - - step2[0] = SUB_EPI16(step2[0], s3_00_0); - step2[1] = SUB_EPI16(step2[1], s3_01_0); - step2[2] = SUB_EPI16(step2[2], s3_02_0); - step2[3] = SUB_EPI16(step2[3], s3_03_0); - step2[4] = SUB_EPI16(step2[4], s3_04_0); - step2[5] = SUB_EPI16(step2[5], s3_05_0); - step2[6] = SUB_EPI16(step2[6], s3_06_0); - step2[7] = SUB_EPI16(step2[7], s3_07_0); - step2[8] = SUB_EPI16(step2[8], s2_08_0); - step2[9] = SUB_EPI16(step2[9], s2_09_0); - step2[10] = SUB_EPI16(step2[10], s3_10_0); - step2[11] = SUB_EPI16(step2[11], s3_11_0); - step2[12] = SUB_EPI16(step2[12], s3_12_0); - step2[13] = SUB_EPI16(step2[13], s3_13_0); - step2[14] = SUB_EPI16(step2[14], s2_14_0); - step2[15] = SUB_EPI16(step2[15], s2_15_0); - step1[16] = SUB_EPI16(step1[16], s3_16_0); - step1[17] = SUB_EPI16(step1[17], s3_17_0); - step1[18] = SUB_EPI16(step1[18], s3_18_0); - step1[19] = SUB_EPI16(step1[19], s3_19_0); - step2[20] = SUB_EPI16(step2[20], s3_20_0); - step2[21] = SUB_EPI16(step2[21], s3_21_0); - step2[22] = SUB_EPI16(step2[22], s3_22_0); - step2[23] = SUB_EPI16(step2[23], s3_23_0); - step2[24] = SUB_EPI16(step2[24], s3_24_0); - step2[25] = SUB_EPI16(step2[25], s3_25_0); - step2[26] = SUB_EPI16(step2[26], s3_26_0); - step2[27] = SUB_EPI16(step2[27], s3_27_0); - step1[28] = SUB_EPI16(step1[28], s3_28_0); - step1[29] = SUB_EPI16(step1[29], s3_29_0); - step1[30] = SUB_EPI16(step1[30], s3_30_0); - step1[31] = SUB_EPI16(step1[31], s3_31_0); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x32( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], - &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], - &step2[22], &step2[23], &step2[24], 
&step2[25], &step2[26], - &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - step2[0] = _mm_add_epi16(step2[0], kOne); - step2[1] = _mm_add_epi16(step2[1], kOne); - step2[2] = _mm_add_epi16(step2[2], kOne); - step2[3] = _mm_add_epi16(step2[3], kOne); - step2[4] = _mm_add_epi16(step2[4], kOne); - step2[5] = _mm_add_epi16(step2[5], kOne); - step2[6] = _mm_add_epi16(step2[6], kOne); - step2[7] = _mm_add_epi16(step2[7], kOne); - step2[8] = _mm_add_epi16(step2[8], kOne); - step2[9] = _mm_add_epi16(step2[9], kOne); - step2[10] = _mm_add_epi16(step2[10], kOne); - step2[11] = _mm_add_epi16(step2[11], kOne); - step2[12] = _mm_add_epi16(step2[12], kOne); - step2[13] = _mm_add_epi16(step2[13], kOne); - step2[14] = _mm_add_epi16(step2[14], kOne); - step2[15] = _mm_add_epi16(step2[15], kOne); - step1[16] = _mm_add_epi16(step1[16], kOne); - step1[17] = _mm_add_epi16(step1[17], kOne); - step1[18] = _mm_add_epi16(step1[18], kOne); - step1[19] = _mm_add_epi16(step1[19], kOne); - step2[20] = _mm_add_epi16(step2[20], kOne); - step2[21] = _mm_add_epi16(step2[21], kOne); - step2[22] = _mm_add_epi16(step2[22], kOne); - step2[23] = _mm_add_epi16(step2[23], kOne); - step2[24] = _mm_add_epi16(step2[24], kOne); - step2[25] = _mm_add_epi16(step2[25], kOne); - step2[26] = _mm_add_epi16(step2[26], kOne); - step2[27] = _mm_add_epi16(step2[27], kOne); - step1[28] = _mm_add_epi16(step1[28], kOne); - step1[29] = _mm_add_epi16(step1[29], kOne); - step1[30] = _mm_add_epi16(step1[30], kOne); - step1[31] = _mm_add_epi16(step1[31], kOne); - - step2[0] = _mm_srai_epi16(step2[0], 2); - step2[1] = _mm_srai_epi16(step2[1], 2); - step2[2] = _mm_srai_epi16(step2[2], 2); - step2[3] = _mm_srai_epi16(step2[3], 2); - step2[4] = _mm_srai_epi16(step2[4], 2); - step2[5] = _mm_srai_epi16(step2[5], 2); - step2[6] = _mm_srai_epi16(step2[6], 2); - step2[7] = _mm_srai_epi16(step2[7], 2); - step2[8] = _mm_srai_epi16(step2[8], 2); - step2[9] = _mm_srai_epi16(step2[9], 2); - step2[10] = _mm_srai_epi16(step2[10], 2); - step2[11] = _mm_srai_epi16(step2[11], 2); - step2[12] = _mm_srai_epi16(step2[12], 2); - step2[13] = _mm_srai_epi16(step2[13], 2); - step2[14] = _mm_srai_epi16(step2[14], 2); - step2[15] = _mm_srai_epi16(step2[15], 2); - step1[16] = _mm_srai_epi16(step1[16], 2); - step1[17] = _mm_srai_epi16(step1[17], 2); - step1[18] = _mm_srai_epi16(step1[18], 2); - step1[19] = _mm_srai_epi16(step1[19], 2); - step2[20] = _mm_srai_epi16(step2[20], 2); - step2[21] = _mm_srai_epi16(step2[21], 2); - step2[22] = _mm_srai_epi16(step2[22], 2); - step2[23] = _mm_srai_epi16(step2[23], 2); - step2[24] = _mm_srai_epi16(step2[24], 2); - step2[25] = _mm_srai_epi16(step2[25], 2); - step2[26] = _mm_srai_epi16(step2[26], 2); - step2[27] = _mm_srai_epi16(step2[27], 2); - step1[28] = _mm_srai_epi16(step1[28], 2); - step1[29] = _mm_srai_epi16(step1[29], 2); - step1[30] = _mm_srai_epi16(step1[30], 2); - step1[31] = _mm_srai_epi16(step1[31], 2); - } -#endif // !FDCT32x32_HIGH_PRECISION - -#if FDCT32x32_HIGH_PRECISION - if (pass == 0) { -#endif - // Stage 3 - { - step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); - step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); - step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); - step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); - step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); - step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); - step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); - step3[7] = SUB_EPI16(step2[(8 - 8)], 
step2[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], - &step3[3], &step3[4], &step3[5], - &step3[6], &step3[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], - &step3[13]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[16] = ADD_EPI16(step2[23], step1[16]); - step3[17] = ADD_EPI16(step2[22], step1[17]); - step3[18] = ADD_EPI16(step2[21], step1[18]); - step3[19] = ADD_EPI16(step2[20], step1[19]); - step3[20] = SUB_EPI16(step1[19], step2[20]); - step3[21] = SUB_EPI16(step1[18], step2[21]); - step3[22] = SUB_EPI16(step1[17], step2[22]); - step3[23] = SUB_EPI16(step1[16], step2[23]); - step3[24] = SUB_EPI16(step1[31], step2[24]); - step3[25] = SUB_EPI16(step1[30], step2[25]); - step3[26] = SUB_EPI16(step1[29], step2[26]); - step3[27] = SUB_EPI16(step1[28], step2[27]); - step3[28] = ADD_EPI16(step2[27], step1[28]); - step3[29] = ADD_EPI16(step2[26], step1[29]); - step3[30] = ADD_EPI16(step2[25], 
step1[30]); - step3[31] = ADD_EPI16(step2[24], step1[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], - &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], - &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], - &step3[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - - // Stage 4 - { - step1[0] = ADD_EPI16(step3[3], step3[0]); - step1[1] = ADD_EPI16(step3[2], step3[1]); - step1[2] = SUB_EPI16(step3[1], step3[2]); - step1[3] = SUB_EPI16(step3[0], step3[3]); - step1[8] = ADD_EPI16(step3[11], step2[8]); - step1[9] = ADD_EPI16(step3[10], step2[9]); - step1[10] = SUB_EPI16(step2[9], step3[10]); - step1[11] = SUB_EPI16(step2[8], step3[11]); - step1[12] = SUB_EPI16(step2[15], step3[12]); - step1[13] = SUB_EPI16(step2[14], step3[13]); - step1[14] = ADD_EPI16(step3[13], step2[14]); - step1[15] = ADD_EPI16(step3[12], step2[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], - &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], - &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 
= _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = 
_mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], - &step1[21], &step1[26], &step1[27], - &step1[28], &step1[29]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 5 - { - step2[4] = ADD_EPI16(step1[5], step3[4]); - step2[5] = SUB_EPI16(step3[4], step1[5]); - step2[6] = SUB_EPI16(step3[7], step1[6]); - step2[7] = ADD_EPI16(step1[6], step3[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], - &step2[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = - _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = - _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = - _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = - _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = - _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = - _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = - _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = - _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_09_0 = 
_mm_unpacklo_epi16(step1[9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], - &step2[14]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step2[16] = ADD_EPI16(step1[19], step3[16]); - step2[17] = ADD_EPI16(step1[18], step3[17]); - step2[18] = SUB_EPI16(step3[17], step1[18]); - step2[19] = SUB_EPI16(step3[16], step1[19]); - step2[20] = SUB_EPI16(step3[23], step1[20]); - step2[21] = SUB_EPI16(step3[22], step1[21]); - step2[22] = ADD_EPI16(step1[21], step3[22]); - step2[23] = ADD_EPI16(step1[20], step3[23]); - step2[24] = ADD_EPI16(step1[27], step3[24]); - step2[25] = ADD_EPI16(step1[26], step3[25]); - step2[26] = SUB_EPI16(step3[25], step1[26]); - step2[27] = SUB_EPI16(step3[24], step1[27]); - step2[28] = SUB_EPI16(step3[31], step1[28]); - step2[29] = SUB_EPI16(step3[30], step1[29]); - step2[30] = ADD_EPI16(step1[29], step3[30]); - step2[31] = ADD_EPI16(step1[28], step3[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], - &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], - &step2[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, 
stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = - _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = - _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = - _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = - _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = - _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = - _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = - _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = - _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[8] = ADD_EPI16(step2[9], step1[8]); - step3[9] = SUB_EPI16(step1[8], step2[9]); - step3[10] = SUB_EPI16(step1[11], step2[10]); - step3[11] = ADD_EPI16(step2[10], step1[11]); - step3[12] = ADD_EPI16(step2[13], step1[12]); - step3[13] = SUB_EPI16(step1[12], step2[13]); - step3[14] = SUB_EPI16(step1[15], step2[14]); - step3[15] = ADD_EPI16(step2[14], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], - &step3[11], &step3[12], &step3[13], - &step3[14], &step3[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, 
stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, 
DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], - &step3[22], &step3[25], &step3[26], - &step3[29], &step3[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = - _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = - _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = - _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = - _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = - _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = - _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = - _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = 
- _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = - _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = - _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = - _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = - _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = - _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = - _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = - _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = - _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step1[16] = ADD_EPI16(step3[17], step2[16]); - step1[17] = SUB_EPI16(step2[16], step3[17]); - step1[18] = SUB_EPI16(step2[19], step3[18]); - step1[19] = ADD_EPI16(step3[18], step2[19]); - step1[20] = ADD_EPI16(step3[21], step2[20]); - step1[21] = SUB_EPI16(step2[20], step3[21]); - step1[22] = SUB_EPI16(step2[23], step3[22]); - step1[23] = ADD_EPI16(step3[22], step2[23]); - step1[24] = ADD_EPI16(step3[25], step2[24]); - step1[25] = SUB_EPI16(step2[24], step3[25]); - step1[26] = SUB_EPI16(step2[27], step3[26]); - step1[27] = ADD_EPI16(step3[26], step2[27]); - step1[28] = ADD_EPI16(step3[29], step2[28]); - step1[29] = SUB_EPI16(step2[28], step3[29]); - step1[30] = SUB_EPI16(step2[31], step3[30]); - step1[31] = ADD_EPI16(step3[30], step2[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], - &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], - &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], - &step1[31]); - if (overflow) { - if 
(pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = - _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = - _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = - _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = - _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = - _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = - _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = - _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = - _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = - _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = - _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = - _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = - _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = - _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = - _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = - _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = - _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = 
_mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = - _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = - _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = - _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = - _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = - _mm_add_epi32(out_13_2, 
k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = - _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = - _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = - _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = - _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = - _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = - _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = - _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = - _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = - _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = - _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = - _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } -#if FDCT32x32_HIGH_PRECISION - } else { - __m128i lstep1[64], lstep2[64], lstep3[64]; - __m128i u[32], v[32], sign[16]; - const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); - // start using 32-bit operations - // stage 3 - { - // expanding to 32-bit length priori to addition operations - lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); - lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); - lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); - lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); - lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); - lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); - lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); - lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); - lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); - lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); - lstep2[10] = 
_mm_unpacklo_epi16(step2[5], kZero); - lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); - lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); - lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); - lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); - lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); - lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); - lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); - lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); - lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); - lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); - lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); - lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); - lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); - lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); - lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); - lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); - lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); - lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); - lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); - lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); - lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); - - lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); - lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); - lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); - lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); - lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); - lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); - lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); - lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); - lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); - lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); - lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); - lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); - lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); - lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); - lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); - lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - lstep3[23] = _mm_srai_epi32(s3_11_5, 
DCT_CONST_BITS); - lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - } - { - lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); - lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); - lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); - lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); - lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); - lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); - lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); - lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); - lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); - lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); - lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); - lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); - lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); - lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); - lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); - lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); - lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); - lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); - lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); - lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); - lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); - lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); - lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); - lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); - lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); - lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); - lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); - lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); - lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); - lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); - lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); - lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); - - lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); - lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); - lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); - lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); - lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); - lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); - lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); - lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); - lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); - lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); - lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); - lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); - lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); - lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); - lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); - lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); - lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); - lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); - lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); - lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); - lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); - lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); - lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); - lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); - lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); - lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); - lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); - lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); - lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); - lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); - lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); - lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); - - lstep3[32] = 
_mm_add_epi32(lstep2[46], lstep1[32]); - lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); - - lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); - lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); - lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); - lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); - lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); - lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); - lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); - lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); - lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); - lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); - lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); - lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); - lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); - lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); - lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); - lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); - lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); - lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); - lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); - lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); - lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); - lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); - lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); - lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); - lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); - lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); - lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); - lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); - lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); - lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); - } - - // stage 4 - { - // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); - lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); - lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); - lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); - lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); - lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); - lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); - lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); - lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); - lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); - lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); - lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); - lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); - lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); - lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); - lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); - - lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); - lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); - lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); - lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); - lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); - lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); - lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); - lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); - lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); - lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); - lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); - lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); - lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); - lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); - lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); - lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); - lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); - lstep1[25] = _mm_sub_epi32(lstep2[31], 
lstep3[25]); - lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); - lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); - lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); - lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); - lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); - lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); - } - { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_m16); - v[1] = k_madd_epi32(u[1], k32_p16_m16); - v[2] = k_madd_epi32(u[2], k32_p16_m16); - v[3] = k_madd_epi32(u[3], k32_p16_m16); - v[4] = k_madd_epi32(u[0], k32_p16_p16); - v[5] = k_madd_epi32(u[1], k32_p16_p16); - v[6] = k_madd_epi32(u[2], k32_p16_p16); - v[7] = k_madd_epi32(u[3], k32_p16_p16); -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], - &v[5], &v[6], &v[7], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); - u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); - u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); - u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); - u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); - u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); - u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); - u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); - u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); - u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); - u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); - u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); - u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); - u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); - u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); - u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m08_p24); - v[5] = k_madd_epi32(u[5], k32_m08_p24); - v[6] = k_madd_epi32(u[6], k32_m08_p24); - v[7] = k_madd_epi32(u[7], k32_m08_p24); - v[8] = k_madd_epi32(u[8], k32_m24_m08); - v[9] = k_madd_epi32(u[9], k32_m24_m08); - v[10] = k_madd_epi32(u[10], k32_m24_m08); - v[11] = 
k_madd_epi32(u[11], k32_m24_m08); - v[12] = k_madd_epi32(u[12], k32_m24_m08); - v[13] = k_madd_epi32(u[13], k32_m24_m08); - v[14] = k_madd_epi32(u[14], k32_m24_m08); - v[15] = k_madd_epi32(u[15], k32_m24_m08); - v[16] = k_madd_epi32(u[12], k32_m08_p24); - v[17] = k_madd_epi32(u[13], k32_m08_p24); - v[18] = k_madd_epi32(u[14], k32_m08_p24); - v[19] = k_madd_epi32(u[15], k32_m08_p24); - v[20] = k_madd_epi32(u[8], k32_m08_p24); - v[21] = k_madd_epi32(u[9], k32_m08_p24); - v[22] = k_madd_epi32(u[10], k32_m08_p24); - v[23] = k_madd_epi32(u[11], k32_m08_p24); - v[24] = k_madd_epi32(u[4], k32_p24_p08); - v[25] = k_madd_epi32(u[5], k32_p24_p08); - v[26] = k_madd_epi32(u[6], k32_p24_p08); - v[27] = k_madd_epi32(u[7], k32_p24_p08); - v[28] = k_madd_epi32(u[0], k32_p24_p08); - v[29] = k_madd_epi32(u[1], k32_p24_p08); - v[30] = k_madd_epi32(u[2], k32_p24_p08); - v[31] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep1[57] = _mm_srai_epi32(v[13], 
                                           DCT_CONST_BITS);
-        lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-        lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-      }
-      // stage 5
-      {
-        lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
-        lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
-        lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
-        lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
-        lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
-        lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
-        lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
-        lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
-      }
-      {
-        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-        const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-        const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
-        u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
-        u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
-        u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
-        u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
-        u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
-        u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
-        u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
-        u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-        // TODO(jingning): manually inline k_madd_epi32_ to further hide
-        // instruction latency.
-        v[0] = k_madd_epi32(u[0], k32_p16_p16);
-        v[1] = k_madd_epi32(u[1], k32_p16_p16);
-        v[2] = k_madd_epi32(u[2], k32_p16_p16);
-        v[3] = k_madd_epi32(u[3], k32_p16_p16);
-        v[4] = k_madd_epi32(u[0], k32_p16_m16);
-        v[5] = k_madd_epi32(u[1], k32_p16_m16);
-        v[6] = k_madd_epi32(u[2], k32_p16_m16);
-        v[7] = k_madd_epi32(u[3], k32_p16_m16);
-        v[8] = k_madd_epi32(u[4], k32_p24_p08);
-        v[9] = k_madd_epi32(u[5], k32_p24_p08);
-        v[10] = k_madd_epi32(u[6], k32_p24_p08);
-        v[11] = k_madd_epi32(u[7], k32_p24_p08);
-        v[12] = k_madd_epi32(u[4], k32_m08_p24);
-        v[13] = k_madd_epi32(u[5], k32_m08_p24);
-        v[14] = k_madd_epi32(u[6], k32_m08_p24);
-        v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
-        overflow = k_check_epi32_overflow_16(
-            &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-            &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-        if (overflow) {
-          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        u[0] = k_packs_epi64(v[0], v[1]);
-        u[1] = k_packs_epi64(v[2], v[3]);
-        u[2] = k_packs_epi64(v[4], v[5]);
-        u[3] = k_packs_epi64(v[6], v[7]);
-        u[4] = k_packs_epi64(v[8], v[9]);
-        u[5] = k_packs_epi64(v[10], v[11]);
-        u[6] = k_packs_epi64(v[12], v[13]);
-        u[7] = k_packs_epi64(v[14], v[15]);
-
-        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-        v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-        v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-        v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-        v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-        u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-        u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-        u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-        u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-        u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-        u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-        u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-        sign[0] = _mm_cmplt_epi32(u[0], kZero);
-        sign[1] = _mm_cmplt_epi32(u[1], kZero);
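The cmplt/sub/add/srai run that starts here is the pass-final descale used throughout both 32-bit branches: _mm_cmplt_epi32 yields an all-ones mask (numerically -1) in exactly the negative lanes, so subtracting the mask adds the extra unit of bias those lanes need before the +1 and the arithmetic shift. A per-lane scalar sketch of the sequence (the helper name is illustrative, not from this file):

    /* Models: sign = _mm_cmplt_epi32(u, kZero);   // -1 in negative lanes
     *         u    = _mm_sub_epi32(u, sign);      // +1 where u < 0
     *         u    = _mm_add_epi32(u, K32One);    // +1 everywhere
     *         u    = _mm_srai_epi32(u, 2);        // arithmetic shift
     * i.e. a divide by 4, rounded to nearest with ties toward zero. */
    static inline int32_t round_shift2_signed(int32_t v) {
      return (v + 1 + (v < 0)) >> 2; /* assumes arithmetic right shift */
    }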
sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - // Combine - out[0] = _mm_packs_epi32(u[0], u[1]); - out[16] = _mm_packs_epi32(u[2], u[3]); - out[8] = _mm_packs_epi32(u[4], u[5]); - out[24] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); - u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); - u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); - u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); - u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); - u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); - u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); - u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m24_m08); - v[5] = k_madd_epi32(u[5], k32_m24_m08); - v[6] = k_madd_epi32(u[6], k32_m24_m08); - v[7] = k_madd_epi32(u[7], k32_m24_m08); - v[8] = k_madd_epi32(u[4], k32_m08_p24); - v[9] = k_madd_epi32(u[5], k32_m08_p24); - v[10] = k_madd_epi32(u[6], k32_m08_p24); - v[11] = k_madd_epi32(u[7], k32_m08_p24); - v[12] = k_madd_epi32(u[0], k32_p24_p08); - v[13] = k_madd_epi32(u[1], k32_p24_p08); - v[14] = k_madd_epi32(u[2], k32_p24_p08); - v[15] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = 
_mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - } - { - lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); - lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); - lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); - lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); - lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); - lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); - lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); - lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); - lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); - lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); - lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); - lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); - lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); - lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); - lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); - lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); - lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); - lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); - lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); - lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); - lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); - lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); - lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); - lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); - lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); - lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); - lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); - lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); - lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); - lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); - lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); - lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); - } - // stage 6 - { - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[13] = 
_mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - - v[0] = k_madd_epi32(u[0], k32_p28_p04); - v[1] = k_madd_epi32(u[1], k32_p28_p04); - v[2] = k_madd_epi32(u[2], k32_p28_p04); - v[3] = k_madd_epi32(u[3], k32_p28_p04); - v[4] = k_madd_epi32(u[4], k32_p12_p20); - v[5] = k_madd_epi32(u[5], k32_p12_p20); - v[6] = k_madd_epi32(u[6], k32_p12_p20); - v[7] = k_madd_epi32(u[7], k32_p12_p20); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m04_p28); - v[13] = k_madd_epi32(u[13], k32_m04_p28); - v[14] = k_madd_epi32(u[14], k32_m04_p28); - v[15] = k_madd_epi32(u[15], k32_m04_p28); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm_cmplt_epi32(u[0], kZero); - sign[1] = _mm_cmplt_epi32(u[1], kZero); - sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - out[4] = _mm_packs_epi32(u[0], u[1]); - out[20] = _mm_packs_epi32(u[2], 
u[3]); - out[12] = _mm_packs_epi32(u[4], u[5]); - out[28] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); - lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); - lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); - lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); - lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); - lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); - lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); - lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); - lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); - lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); - lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); - lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); - lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); - lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); - lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); - lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); - } - { - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m12_m20 = - pair_set_epi32(-cospi_12_64, -cospi_20_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - - u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); - u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); - u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); - u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); - u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); - u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); - u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); - u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); - u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); - u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); - u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); - u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); - u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); - u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); - u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); - u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); - - v[0] = k_madd_epi32(u[0], k32_m04_p28); - v[1] = k_madd_epi32(u[1], k32_m04_p28); - v[2] = k_madd_epi32(u[2], k32_m04_p28); - v[3] = k_madd_epi32(u[3], k32_m04_p28); - v[4] = k_madd_epi32(u[4], k32_m28_m04); - v[5] = k_madd_epi32(u[5], k32_m28_m04); - v[6] = k_madd_epi32(u[6], k32_m28_m04); - v[7] = k_madd_epi32(u[7], k32_m28_m04); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m12_m20); - v[13] = k_madd_epi32(u[13], k32_m12_m20); - v[14] = k_madd_epi32(u[14], k32_m12_m20); - v[15] = k_madd_epi32(u[15], k32_m12_m20); - v[16] = k_madd_epi32(u[12], k32_m20_p12); - v[17] = k_madd_epi32(u[13], k32_m20_p12); - v[18] = k_madd_epi32(u[14], k32_m20_p12); - v[19] = k_madd_epi32(u[15], k32_m20_p12); - v[20] = k_madd_epi32(u[8], k32_p12_p20); - v[21] = k_madd_epi32(u[9], k32_p12_p20); - v[22] = k_madd_epi32(u[10], k32_p12_p20); - v[23] = k_madd_epi32(u[11], k32_p12_p20); - v[24] = k_madd_epi32(u[4], 
k32_m04_p28); - v[25] = k_madd_epi32(u[5], k32_m04_p28); - v[26] = k_madd_epi32(u[6], k32_m04_p28); - v[27] = k_madd_epi32(u[7], k32_m04_p28); - v[28] = k_madd_epi32(u[0], k32_p28_p04); - v[29] = k_madd_epi32(u[1], k32_p28_p04); - v[30] = k_madd_epi32(u[2], k32_p28_p04); - v[31] = k_madd_epi32(u[3], k32_p28_p04); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 7 - { - const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); - const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); - const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); - const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); - const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); - const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); - const __m128i 
k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); - const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); - - u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); - u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); - u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); - u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); - u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); - u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); - u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); - u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); - u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); - u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); - u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); - u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); - u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); - u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); - u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); - u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); - - v[0] = k_madd_epi32(u[0], k32_p30_p02); - v[1] = k_madd_epi32(u[1], k32_p30_p02); - v[2] = k_madd_epi32(u[2], k32_p30_p02); - v[3] = k_madd_epi32(u[3], k32_p30_p02); - v[4] = k_madd_epi32(u[4], k32_p14_p18); - v[5] = k_madd_epi32(u[5], k32_p14_p18); - v[6] = k_madd_epi32(u[6], k32_p14_p18); - v[7] = k_madd_epi32(u[7], k32_p14_p18); - v[8] = k_madd_epi32(u[8], k32_p22_p10); - v[9] = k_madd_epi32(u[9], k32_p22_p10); - v[10] = k_madd_epi32(u[10], k32_p22_p10); - v[11] = k_madd_epi32(u[11], k32_p22_p10); - v[12] = k_madd_epi32(u[12], k32_p06_p26); - v[13] = k_madd_epi32(u[13], k32_p06_p26); - v[14] = k_madd_epi32(u[14], k32_p06_p26); - v[15] = k_madd_epi32(u[15], k32_p06_p26); - v[16] = k_madd_epi32(u[12], k32_m26_p06); - v[17] = k_madd_epi32(u[13], k32_m26_p06); - v[18] = k_madd_epi32(u[14], k32_m26_p06); - v[19] = k_madd_epi32(u[15], k32_m26_p06); - v[20] = k_madd_epi32(u[8], k32_m10_p22); - v[21] = k_madd_epi32(u[9], k32_m10_p22); - v[22] = k_madd_epi32(u[10], k32_m10_p22); - v[23] = k_madd_epi32(u[11], k32_m10_p22); - v[24] = k_madd_epi32(u[4], k32_m18_p14); - v[25] = k_madd_epi32(u[5], k32_m18_p14); - v[26] = k_madd_epi32(u[6], k32_m18_p14); - v[27] = k_madd_epi32(u[7], k32_m18_p14); - v[28] = k_madd_epi32(u[0], k32_m02_p30); - v[29] = k_madd_epi32(u[1], k32_m02_p30); - v[30] = k_madd_epi32(u[2], k32_m02_p30); - v[31] = k_madd_epi32(u[3], k32_m02_p30); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], 
                                 k__DCT_CONST_ROUNDING);
-        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-        v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-        v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-        v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-        v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-        v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-        v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-        v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-        v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-        v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-        v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-        v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-        v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-        u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-        u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-        u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-        u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-        u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-        u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-        u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-        u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-        u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-        u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-        u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-        u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-        u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-        u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-        u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-        v[0] = _mm_cmplt_epi32(u[0], kZero);
-        v[1] = _mm_cmplt_epi32(u[1], kZero);
-        v[2] = _mm_cmplt_epi32(u[2], kZero);
-        v[3] = _mm_cmplt_epi32(u[3], kZero);
-        v[4] = _mm_cmplt_epi32(u[4], kZero);
-        v[5] = _mm_cmplt_epi32(u[5], kZero);
-        v[6] = _mm_cmplt_epi32(u[6], kZero);
-        v[7] = _mm_cmplt_epi32(u[7], kZero);
-        v[8] = _mm_cmplt_epi32(u[8], kZero);
-        v[9] = _mm_cmplt_epi32(u[9], kZero);
-        v[10] = _mm_cmplt_epi32(u[10], kZero);
-        v[11] = _mm_cmplt_epi32(u[11], kZero);
-        v[12] = _mm_cmplt_epi32(u[12], kZero);
-        v[13] = _mm_cmplt_epi32(u[13], kZero);
-        v[14] = _mm_cmplt_epi32(u[14], kZero);
-        v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-        u[0] = _mm_sub_epi32(u[0], v[0]);
-        u[1] = _mm_sub_epi32(u[1], v[1]);
-        u[2] = _mm_sub_epi32(u[2], v[2]);
-        u[3] = _mm_sub_epi32(u[3], v[3]);
-        u[4] = _mm_sub_epi32(u[4], v[4]);
-        u[5] = _mm_sub_epi32(u[5], v[5]);
-        u[6] = _mm_sub_epi32(u[6], v[6]);
-        u[7] = _mm_sub_epi32(u[7], v[7]);
-        u[8] = _mm_sub_epi32(u[8], v[8]);
-        u[9] = _mm_sub_epi32(u[9], v[9]);
-        u[10] = _mm_sub_epi32(u[10], v[10]);
-        u[11] = _mm_sub_epi32(u[11], v[11]);
-        u[12] = _mm_sub_epi32(u[12], v[12]);
-        u[13] = _mm_sub_epi32(u[13], v[13]);
-        u[14] = _mm_sub_epi32(u[14], v[14]);
-        u[15] = _mm_sub_epi32(u[15], v[15]);
-
-        v[0] = _mm_add_epi32(u[0], K32One);
-        v[1] = _mm_add_epi32(u[1], K32One);
-        v[2] = _mm_add_epi32(u[2], K32One);
-        v[3] = _mm_add_epi32(u[3], K32One);
-        v[4] = _mm_add_epi32(u[4], K32One);
-        v[5] = _mm_add_epi32(u[5], K32One);
-        v[6] = _mm_add_epi32(u[6], K32One);
-        v[7] = _mm_add_epi32(u[7], K32One);
-        v[8] = _mm_add_epi32(u[8], K32One);
-        v[9] = _mm_add_epi32(u[9], K32One);
-        v[10] = _mm_add_epi32(u[10], K32One);
-        v[11] = _mm_add_epi32(u[11], K32One);
-        v[12] = _mm_add_epi32(u[12], K32One);
-        v[13] = _mm_add_epi32(u[13], K32One);
-        v[14] = _mm_add_epi32(u[14], K32One);
-        v[15] = _mm_add_epi32(u[15], K32One);
-
-        u[0] = _mm_srai_epi32(v[0], 2);
-        u[1] = _mm_srai_epi32(v[1], 2);
-        u[2] = _mm_srai_epi32(v[2], 2);
-        u[3] = _mm_srai_epi32(v[3], 2);
-        u[4] = _mm_srai_epi32(v[4], 2);
-        u[5] = _mm_srai_epi32(v[5], 2);
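The v[]/u[] ladder above is the high-precision form of the same fixed-point butterfly the 16-bit path performs with _mm_madd_epi16: interleave two 32-bit inputs, multiply-accumulate each pair against a pair_set_epi32() cosine pair at 64-bit precision via k_madd_epi32(), repack with k_packs_epi64(), then round back down by DCT_CONST_BITS (14) bits. Per output lane it amounts to the following sketch (argument names are illustrative; the cospi_*_64 constants are cos(n*pi/64) scaled by 2^14):

    /* One lane of the k_madd/k_packs/round sequence; the last two steps
     * are what k__DCT_CONST_ROUNDING and the DCT_CONST_BITS shift do. */
    static inline int32_t butterfly_lane(int32_t x0, int32_t x1,
                                         int32_t c0, int32_t c1) {
      const int64_t t = (int64_t)x0 * c0 + (int64_t)x1 * c1;
      return (int32_t)((t + (1 << (14 - 1))) >> 14); /* dct_const_round_shift */
    }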
u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[2] = _mm_packs_epi32(u[0], u[1]); - out[18] = _mm_packs_epi32(u[2], u[3]); - out[10] = _mm_packs_epi32(u[4], u[5]); - out[26] = _mm_packs_epi32(u[6], u[7]); - out[6] = _mm_packs_epi32(u[8], u[9]); - out[22] = _mm_packs_epi32(u[10], u[11]); - out[14] = _mm_packs_epi32(u[12], u[13]); - out[30] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); - lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); - lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); - lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); - lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); - lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); - lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); - lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); - lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); - lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); - lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); - lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); - lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); - lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); - lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); - lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); - lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); - lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); - lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); - lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); - lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); - lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); - lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); - lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); - lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); - lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); - lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); - lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); - lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); - lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); - lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); - lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); - } - // stage 8 - { - const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); - const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); - const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); - const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); - const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); - const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); - const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); - const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); - - u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); - u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); - u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); - u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); - u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); - u[5] = 
_mm_unpackhi_epi32(lstep1[34], lstep1[60]); - u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); - u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); - u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); - u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); - u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); - u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); - u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); - u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); - u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); - u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); - - v[0] = k_madd_epi32(u[0], k32_p31_p01); - v[1] = k_madd_epi32(u[1], k32_p31_p01); - v[2] = k_madd_epi32(u[2], k32_p31_p01); - v[3] = k_madd_epi32(u[3], k32_p31_p01); - v[4] = k_madd_epi32(u[4], k32_p15_p17); - v[5] = k_madd_epi32(u[5], k32_p15_p17); - v[6] = k_madd_epi32(u[6], k32_p15_p17); - v[7] = k_madd_epi32(u[7], k32_p15_p17); - v[8] = k_madd_epi32(u[8], k32_p23_p09); - v[9] = k_madd_epi32(u[9], k32_p23_p09); - v[10] = k_madd_epi32(u[10], k32_p23_p09); - v[11] = k_madd_epi32(u[11], k32_p23_p09); - v[12] = k_madd_epi32(u[12], k32_p07_p25); - v[13] = k_madd_epi32(u[13], k32_p07_p25); - v[14] = k_madd_epi32(u[14], k32_p07_p25); - v[15] = k_madd_epi32(u[15], k32_p07_p25); - v[16] = k_madd_epi32(u[12], k32_m25_p07); - v[17] = k_madd_epi32(u[13], k32_m25_p07); - v[18] = k_madd_epi32(u[14], k32_m25_p07); - v[19] = k_madd_epi32(u[15], k32_m25_p07); - v[20] = k_madd_epi32(u[8], k32_m09_p23); - v[21] = k_madd_epi32(u[9], k32_m09_p23); - v[22] = k_madd_epi32(u[10], k32_m09_p23); - v[23] = k_madd_epi32(u[11], k32_m09_p23); - v[24] = k_madd_epi32(u[4], k32_m17_p15); - v[25] = k_madd_epi32(u[5], k32_m17_p15); - v[26] = k_madd_epi32(u[6], k32_m17_p15); - v[27] = k_madd_epi32(u[7], k32_m17_p15); - v[28] = k_madd_epi32(u[0], k32_m01_p31); - v[29] = k_madd_epi32(u[1], k32_m01_p31); - v[30] = k_madd_epi32(u[2], k32_m01_p31); - v[31] = k_madd_epi32(u[3], k32_m01_p31); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = 
_mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[1] = _mm_packs_epi32(u[0], u[1]); - out[17] = 
_mm_packs_epi32(u[2], u[3]); - out[9] = _mm_packs_epi32(u[4], u[5]); - out[25] = _mm_packs_epi32(u[6], u[7]); - out[7] = _mm_packs_epi32(u[8], u[9]); - out[23] = _mm_packs_epi32(u[10], u[11]); - out[15] = _mm_packs_epi32(u[12], u[13]); - out[31] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); - const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); - const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); - const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); - const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); - const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); - const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); - const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); - - u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); - u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); - u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); - u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); - u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); - u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); - u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); - u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); - u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); - u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); - u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); - u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); - u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); - u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); - u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); - u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); - - v[0] = k_madd_epi32(u[0], k32_p27_p05); - v[1] = k_madd_epi32(u[1], k32_p27_p05); - v[2] = k_madd_epi32(u[2], k32_p27_p05); - v[3] = k_madd_epi32(u[3], k32_p27_p05); - v[4] = k_madd_epi32(u[4], k32_p11_p21); - v[5] = k_madd_epi32(u[5], k32_p11_p21); - v[6] = k_madd_epi32(u[6], k32_p11_p21); - v[7] = k_madd_epi32(u[7], k32_p11_p21); - v[8] = k_madd_epi32(u[8], k32_p19_p13); - v[9] = k_madd_epi32(u[9], k32_p19_p13); - v[10] = k_madd_epi32(u[10], k32_p19_p13); - v[11] = k_madd_epi32(u[11], k32_p19_p13); - v[12] = k_madd_epi32(u[12], k32_p03_p29); - v[13] = k_madd_epi32(u[13], k32_p03_p29); - v[14] = k_madd_epi32(u[14], k32_p03_p29); - v[15] = k_madd_epi32(u[15], k32_p03_p29); - v[16] = k_madd_epi32(u[12], k32_m29_p03); - v[17] = k_madd_epi32(u[13], k32_m29_p03); - v[18] = k_madd_epi32(u[14], k32_m29_p03); - v[19] = k_madd_epi32(u[15], k32_m29_p03); - v[20] = k_madd_epi32(u[8], k32_m13_p19); - v[21] = k_madd_epi32(u[9], k32_m13_p19); - v[22] = k_madd_epi32(u[10], k32_m13_p19); - v[23] = k_madd_epi32(u[11], k32_m13_p19); - v[24] = k_madd_epi32(u[4], k32_m21_p11); - v[25] = k_madd_epi32(u[5], k32_m21_p11); - v[26] = k_madd_epi32(u[6], k32_m21_p11); - v[27] = k_madd_epi32(u[7], k32_m21_p11); - v[28] = k_madd_epi32(u[0], k32_m05_p27); - v[29] = k_madd_epi32(u[1], k32_m05_p27); - v[30] = k_madd_epi32(u[2], k32_m05_p27); - v[31] = k_madd_epi32(u[3], k32_m05_p27); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], 
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], 
v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[5] = _mm_packs_epi32(u[0], u[1]); - out[21] = _mm_packs_epi32(u[2], u[3]); - out[13] = _mm_packs_epi32(u[4], u[5]); - out[29] = _mm_packs_epi32(u[6], u[7]); - out[3] = _mm_packs_epi32(u[8], u[9]); - out[19] = _mm_packs_epi32(u[10], u[11]); - out[11] = _mm_packs_epi32(u[12], u[13]); - out[27] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } -#endif // FDCT32x32_HIGH_PRECISION - // Transpose the results, do it as four 8x8 transposes. 
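The cmplt/sub/add/srai runs that close each output group above are a branch-free signed rounding: _mm_cmplt_epi32 fills negative lanes with -1, so subtracting that mask adds 1 to negative values before the usual +1 bias and the arithmetic shift by 2. A scalar sketch of what one 32-bit lane computes (my reading of the intrinsics, not code from the tree):

    /* Per-lane model of the pattern above: (x + 1 + (x < 0)) >> 2. */
    static int32_t round_shift_quarter(int32_t x) {
      const int32_t mask = (x < 0) ? -1 : 0;  /* _mm_cmplt_epi32(u, kZero)      */
      x -= mask;                              /* _mm_sub_epi32: +1 in neg lanes */
      return (x + 1) >> 2;                    /* add K32One, _mm_srai_epi32(2)  */
    }

The extra bias keeps negative coefficients rounded the same way as the scalar reference that the DCT_HIGH_BIT_DEPTH paths fall back to; the 16-bit transpose code further down plays the mirror trick with _mm_cmpgt_epi16, biasing positive lanes instead.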
-        {
-          int transpose_block;
-          int16_t *output0 = &intermediate[column_start * 32];
-          tran_low_t *output1 = &output_org[column_start * 32];
-          for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-            __m128i *this_out = &out[8 * transpose_block];
-            // 00 01 02 03 04 05 06 07
-            // 10 11 12 13 14 15 16 17
-            // 20 21 22 23 24 25 26 27
-            // 30 31 32 33 34 35 36 37
-            // 40 41 42 43 44 45 46 47
-            // 50 51 52 53 54 55 56 57
-            // 60 61 62 63 64 65 66 67
-            // 70 71 72 73 74 75 76 77
-            const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
-            const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
-            const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
-            const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
-            const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
-            const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
-            const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
-            const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
-            // 00 10 01 11 02 12 03 13
-            // 20 30 21 31 22 32 23 33
-            // 04 14 05 15 06 16 07 17
-            // 24 34 25 35 26 36 27 37
-            // 40 50 41 51 42 52 43 53
-            // 60 70 61 71 62 72 63 73
-            // 44 54 45 55 46 56 47 57
-            // 64 74 65 75 66 76 67 77
-            const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-            const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-            const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-            const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-            const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-            const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-            const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-            const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-            // 00 10 20 30 01 11 21 31
-            // 40 50 60 70 41 51 61 71
-            // 02 12 22 32 03 13 23 33
-            // 42 52 62 72 43 53 63 73
-            // 04 14 24 34 05 15 25 35
-            // 44 54 64 74 45 55 65 75
-            // 06 16 26 36 07 17 27 37
-            // 46 56 66 76 47 57 67 77
-            __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-            __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-            __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-            __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-            __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-            __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-            __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-            __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-            // 00 10 20 30 40 50 60 70
-            // 01 11 21 31 41 51 61 71
-            // 02 12 22 32 42 52 62 72
-            // 03 13 23 33 43 53 63 73
-            // 04 14 24 34 44 54 64 74
-            // 05 15 25 35 45 55 65 75
-            // 06 16 26 36 46 56 66 76
-            // 07 17 27 37 47 57 67 77
-            if (0 == pass) {
-              // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-              // TODO(cd): see quality impact of only doing
-              //           output[j] = (output[j] + 1) >> 2;
-              //           which would remove the code between here ...
-              __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
-              __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
-              __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
-              __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
-              __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
-              __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
-              __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
-              __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
-              tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
-              tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
-              tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
-              tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
-              tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
-              tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
-              tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
-              tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
-              // ... and here.
-              // PS: also change code in av1/encoder/av1_dct.c
-              tr2_0 = _mm_add_epi16(tr2_0, kOne);
-              tr2_1 = _mm_add_epi16(tr2_1, kOne);
-              tr2_2 = _mm_add_epi16(tr2_2, kOne);
-              tr2_3 = _mm_add_epi16(tr2_3, kOne);
-              tr2_4 = _mm_add_epi16(tr2_4, kOne);
-              tr2_5 = _mm_add_epi16(tr2_5, kOne);
-              tr2_6 = _mm_add_epi16(tr2_6, kOne);
-              tr2_7 = _mm_add_epi16(tr2_7, kOne);
-              tr2_0 = _mm_srai_epi16(tr2_0, 2);
-              tr2_1 = _mm_srai_epi16(tr2_1, 2);
-              tr2_2 = _mm_srai_epi16(tr2_2, 2);
-              tr2_3 = _mm_srai_epi16(tr2_3, 2);
-              tr2_4 = _mm_srai_epi16(tr2_4, 2);
-              tr2_5 = _mm_srai_epi16(tr2_5, 2);
-              tr2_6 = _mm_srai_epi16(tr2_6, 2);
-              tr2_7 = _mm_srai_epi16(tr2_7, 2);
-            }
-            // Note: even though all these stores are aligned, using the aligned
-            // intrinsics makes the code slightly slower.
-            if (pass == 0) {
-              _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
-              _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
-              _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
-              _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
-              _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
-              _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
-              _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
-              _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
-              // Process next 8x8
-              output0 += 8;
-            } else {
-              storeu_output(&tr2_0, (output1 + 0 * 32));
-              storeu_output(&tr2_1, (output1 + 1 * 32));
-              storeu_output(&tr2_2, (output1 + 2 * 32));
-              storeu_output(&tr2_3, (output1 + 3 * 32));
-              storeu_output(&tr2_4, (output1 + 4 * 32));
-              storeu_output(&tr2_5, (output1 + 5 * 32));
-              storeu_output(&tr2_6, (output1 + 6 * 32));
-              storeu_output(&tr2_7, (output1 + 7 * 32));
-              // Process next 8x8
-              output1 += 8;
-            }
-          }
-        }
-      }
-}  // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
deleted file mode 100644
index 670f864d0..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
deleted file mode 100644
index 86df4a6f6..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
-#define AOM_DSP_X86_FWD_TXFM_AVX2_H
-
-#include "./aom_config.h"
-
-static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
-
-    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
-
-    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
-
-    _mm256_storeu_si256((__m256i *)out, y0);
-    _mm256_storeu_si256((__m256i *)(out + 8), y1);
-  } else {
-    _mm256_storeu_si256((__m256i *)out, *coeff);
-  }
-}
-
-#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7bb1db70a..1e3d13ec8 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -11,7 +11,8 @@
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
@@ -29,233 +30,6 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms. The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77). The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to as i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations. The outputs, o0 through oF, are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 =
-      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
-  __m128i cmp0, cmp1;
-  int test, overflow;
-#endif
-
-  // Load inputs.
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  // in0 = [i0 i1 i2 i3 iC iD iE iF]
-  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-#if DCT_HIGH_BIT_DEPTH
-  // Check inputs small enough to use optimised code
-  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
-  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
-  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
-  if (test) {
-    aom_highbd_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-  // multiply by 16 to give some extra precision
-  in0 = _mm_slli_epi16(in0, 4);
-  in1 = _mm_slli_epi16(in1, 4);
-  // if (i == 0 && input[0]) input[0] += 1;
-  // add 1 to the upper left pixel if it is non-zero, which helps reduce
-  // the round-trip error
-  {
-    // The mask will only contain whether the first value is zero, all
-    // other comparisons will fail as something shifted by 4 (above << 4)
-    // can never be equal to one.  To increment in the non-zero case, we
-    // add the mask and one for the first element:
-    //  - if zero, mask = -1, v = v - 1 + 1 = v
-    //  - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-    in0 = _mm_add_epi16(in0, mask);
-    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-  }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // and a multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    //   k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    //   k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    //   k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    //   k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&x0, &x1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-    const __m128i t0 = ADD_EPI16(in0, in1);
-    const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&t0, &t1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
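Stage 4 below rounds with k__DCT_CONST_ROUNDING2 and shifts by DCT_CONST_BITS2, folding the normal rounding shift and the final (v + 1) >> 2 post-condition into a single add-and-shift. Writing R for DCT_CONST_ROUNDING and B for DCT_CONST_BITS (so R == 1 << (B - 1)), the two-step form collapses exactly:

    ((((x + R) >> B) + 1) >> 2)  ==  (x + R + (1 << B)) >> (B + 2)
                                 ==  (x + (R + 2 * R)) >> (B + 2)

which is why the constant is DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1) and the shift is DCT_CONST_BITS + 2, and why the combined form can stay bit-exact with the two-step C version.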
-    {
-      // The constants needed here are:
-      //   k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-      //   k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-      //   k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-      //   k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-      // Then add and right-shift to get back to 16-bit range,
-      // but this combines the final right-shift as well to save operations.
-      // This unusual rounding operation is needed to maintain bit-accurate
-      // compatibility with the C version of this function, which has two
-      // rounding steps in a row.
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-      // w0 = [o0 o4 o8 oC]
-      // w1 = [o2 o6 oA oE]
-      // w2 = [o1 o5 o9 oD]
-      // w3 = [o3 o7 oB oF]
-      // remember the o's are numbered according to the correct output location
-      const __m128i x0 = _mm_packs_epi32(w0, w1);
-      const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&x0, &x1);
-      if (overflow) {
-        aom_highbd_fdct4x4_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-        // y1 = [o2 o3 o6 o7 oA oB oE oF]
-        in0 = _mm_unpacklo_epi32(y0, y1);
-        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-        in1 = _mm_unpackhi_epi32(y0, y1);
-        // in1 = [o8 o9 oA oB oC oD oE oF]
-      }
-    }
-  }
-  // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands. Only 2 store instructions needed
-  // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(&in0, output + 0 * 4);
-  storeu_output(&in1, output + 2 * 4);
-}
-
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // Constants
@@ -566,449 +340,5 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   }
 }
 
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. Because the
-  // first pass leaves its results transposed, the second pass can run the
-  // same column code over them, and its own transpose puts everything back
-  // in normal/row positions.
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  // When we use them, in one case, they are all the same. In all others
-  // it's a pair of them that we need to repeat four times. This is done
-  // by constructing the 32 bit constant corresponding to that pair.
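Those pair constants are built with pair_set_epi16 from aom_dsp/x86/txfm_common_sse2.h and consumed by _mm_madd_epi16: interleaving two coefficient rows places an (a, b) pair in every 32-bit lane, so one madd against a (c0, c1) pair produces a*c0 + b*c1 per lane, one butterfly output. A minimal sketch of the idiom (row_a and row_b are illustrative names, not variables from this function):

    /* One butterfly output for the low four lanes. */
    const __m128i k = pair_set_epi16(cospi_24_64, cospi_8_64);  /* [p24 p08 p24 p08 ...]   */
    const __m128i lo = _mm_unpacklo_epi16(row_a, row_b);        /* a0 b0 a1 b1 a2 b2 a3 b3 */
    __m128i p = _mm_madd_epi16(lo, k);                          /* a*p24 + b*p08, 32-bit   */
    p = _mm_add_epi32(p, _mm_set1_epi32(DCT_CONST_ROUNDING));
    p = _mm_srai_epi32(p, DCT_CONST_BITS);                      /* back to 16-bit range    */

The same shape, wrapped in mult_round_shift, accounts for nearly every multiply in the pass loop that follows.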
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (column_start = 0; column_start < 16; column_start += 8) { - __m128i in00, in01, in02, in03, in04, in05, in06, in07; - __m128i in08, in09, in10, in11, in12, in13, in14, in15; - __m128i input0, input1, input2, input3, input4, input5, input6, input7; - __m128i step1_0, step1_1, step1_2, step1_3; - __m128i step1_4, step1_5, step1_6, step1_7; - __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - __m128i step3_0, step3_1, step3_2, step3_3; - __m128i step3_4, step3_5, step3_6, step3_7; - __m128i res00, res01, res02, res03, res04, res05, res06, res07; - __m128i res08, res09, res10, res11, res12, res13, res14, res15; - // Load and pre-condition input. 
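The load block that follows is the only pass-dependent part of the loop: pass 0 scales the pixel-domain rows up by 4 to buy working precision, and pass 1 scales the transposed intermediate back down with rounding. The per-element scalar effect (a sketch, not library code):

    if (pass == 0)
      x = x * 4;         /* _mm_slli_epi16(in, 2) */
    else
      x = (x + 1) >> 2;  /* _mm_add_epi16(in, kOne), then _mm_srai_epi16(in, 2) */

Across both passes the input scaling cancels up to rounding, leaving only the transform's own gain.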
- if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); - // x = x << 2 - in00 = _mm_slli_epi16(in00, 2); - in01 = _mm_slli_epi16(in01, 2); - in02 = _mm_slli_epi16(in02, 2); - in03 = _mm_slli_epi16(in03, 2); - in04 = _mm_slli_epi16(in04, 2); - in05 = _mm_slli_epi16(in05, 2); - in06 = _mm_slli_epi16(in06, 2); - in07 = _mm_slli_epi16(in07, 2); - in08 = _mm_slli_epi16(in08, 2); - in09 = _mm_slli_epi16(in09, 2); - in10 = _mm_slli_epi16(in10, 2); - in11 = _mm_slli_epi16(in11, 2); - in12 = _mm_slli_epi16(in12, 2); - in13 = _mm_slli_epi16(in13, 2); - in14 = _mm_slli_epi16(in14, 2); - in15 = _mm_slli_epi16(in15, 2); - } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); - // x = (x + 1) >> 2 - in00 = _mm_add_epi16(in00, kOne); - in01 = _mm_add_epi16(in01, kOne); - in02 = _mm_add_epi16(in02, kOne); - in03 = _mm_add_epi16(in03, kOne); - in04 = _mm_add_epi16(in04, kOne); - in05 = _mm_add_epi16(in05, kOne); - in06 = _mm_add_epi16(in06, kOne); - in07 = _mm_add_epi16(in07, kOne); - in08 = _mm_add_epi16(in08, kOne); - in09 = _mm_add_epi16(in09, kOne); - in10 = _mm_add_epi16(in10, kOne); - in11 = _mm_add_epi16(in11, kOne); - in12 = _mm_add_epi16(in12, kOne); - in13 = _mm_add_epi16(in13, kOne); - in14 = _mm_add_epi16(in14, kOne); - in15 = _mm_add_epi16(in15, kOne); - in00 = _mm_srai_epi16(in00, 2); - in01 = _mm_srai_epi16(in01, 2); - in02 = _mm_srai_epi16(in02, 2); - in03 = _mm_srai_epi16(in03, 2); - in04 = _mm_srai_epi16(in04, 2); - in05 = _mm_srai_epi16(in05, 2); - in06 = _mm_srai_epi16(in06, 2); - in07 = _mm_srai_epi16(in07, 2); - in08 = _mm_srai_epi16(in08, 2); - in09 = _mm_srai_epi16(in09, 2); - in10 = _mm_srai_epi16(in10, 2); - in11 = _mm_srai_epi16(in11, 2); - in12 = _mm_srai_epi16(in12, 2); - in13 = 
_mm_srai_epi16(in13, 2); - in14 = _mm_srai_epi16(in14, 2); - in15 = _mm_srai_epi16(in15, 2); - } - in += 8; - // Calculate input for the first 8 results. - { - input0 = ADD_EPI16(in00, in15); - input1 = ADD_EPI16(in01, in14); - input2 = ADD_EPI16(in02, in13); - input3 = ADD_EPI16(in03, in12); - input4 = ADD_EPI16(in04, in11); - input5 = ADD_EPI16(in05, in10); - input6 = ADD_EPI16(in06, in09); - input7 = ADD_EPI16(in07, in08); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, - &input4, &input5, &input6, &input7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Calculate input for the next 8 results. - { - step1_0 = SUB_EPI16(in07, in08); - step1_1 = SUB_EPI16(in06, in09); - step1_2 = SUB_EPI16(in05, in10); - step1_3 = SUB_EPI16(in04, in11); - step1_4 = SUB_EPI16(in03, in12); - step1_5 = SUB_EPI16(in02, in13); - step1_6 = SUB_EPI16(in01, in14); - step1_7 = SUB_EPI16(in00, in15); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Work on the first eight values; fdct8(input, even_results); - { - // Add/subtract - const __m128i q0 = ADD_EPI16(input0, input7); - const __m128i q1 = ADD_EPI16(input1, input6); - const __m128i q2 = ADD_EPI16(input2, input5); - const __m128i q3 = ADD_EPI16(input3, input4); - const __m128i q4 = SUB_EPI16(input3, input4); - const __m128i q5 = SUB_EPI16(input2, input5); - const __m128i q6 = SUB_EPI16(input1, input6); - const __m128i q7 = SUB_EPI16(input0, input7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Work on first four results - { - // Add/subtract - const __m128i r0 = ADD_EPI16(q0, q3); - const __m128i r1 = ADD_EPI16(q1, q2); - const __m128i r2 = SUB_EPI16(q1, q2); - const __m128i r3 = SUB_EPI16(q0, q3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us - // into 32 bits. 
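mult_round_shift itself is not in this hunk (it lives on in fwd_txfm_sse2.h), but its shape is implied by the call sites: madd both unpacked halves against one coefficient pair, round, shift, and re-pack to 16 bits with saturation. A reconstruction consistent with how it is called here (the body is my sketch, not the header's verbatim code):

    static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                           const __m128i *pmultiplier,
                                           const __m128i *prounding, int shift) {
      const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
      const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
      const __m128i v0 = _mm_add_epi32(u0, *prounding);
      const __m128i v1 = _mm_add_epi32(u1, *prounding);
      const __m128i w0 = _mm_srai_epi32(v0, shift);
      const __m128i w1 = _mm_srai_epi32(v1, shift);
      return _mm_packs_epi32(w0, w1);  /* saturating pack back to int16 */
    }

The saturating pack is what the check_epi16_overflow_* guards protect against: when a value cannot be represented in 16 bits, the function bails out to the C implementation instead of silently clamping.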
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i r0 = - mult_round_shift(&d0, &d1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - const __m128i r1 = - mult_round_shift(&d0, &d1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&r0, &r1); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // Add/subtract - const __m128i x0 = ADD_EPI16(q4, r0); - const __m128i x1 = SUB_EPI16(q4, r0); - const __m128i x2 = SUB_EPI16(q7, r1); - const __m128i x3 = ADD_EPI16(q7, r1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&res02, &res14, &res10, &res06); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - } - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 3 - { - step3_0 = ADD_EPI16(step1_0, step2_3); - step3_1 = ADD_EPI16(step1_1, step2_2); - step3_2 = SUB_EPI16(step1_1, step2_2); - step3_3 = SUB_EPI16(step1_0, step2_3); - step3_4 = SUB_EPI16(step1_7, step2_4); - step3_5 = SUB_EPI16(step1_6, step2_5); - step3_6 = ADD_EPI16(step1_6, step2_5); - step3_7 = ADD_EPI16(step1_7, step2_4); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, - &step3_4, &step3_5, &step3_6, &step3_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 4 - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = 
_mm_unpackhi_epi16(step3_2, step3_5); - step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 5 - { - step1_0 = ADD_EPI16(step3_0, step2_1); - step1_1 = SUB_EPI16(step3_0, step2_1); - step1_2 = ADD_EPI16(step3_3, step2_2); - step1_3 = SUB_EPI16(step3_3, step2_2); - step1_4 = SUB_EPI16(step3_4, step2_5); - step1_5 = ADD_EPI16(step3_4, step2_5); - step1_6 = SUB_EPI16(step3_7, step2_6); - step1_7 = ADD_EPI16(step3_7, step2_6); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 6 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Transpose the results, do it as two 8x8 transposes. - transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, - &res06, &res07, pass, out0, out1); - transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, - &res14, &res15, pass, out0 + 8, out1 + 8); - if (pass == 0) { - out0 += 8 * 16; - } else { - out1 += 8 * 16; - } - } - // Setup in/out for next pass. 
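Together with the in = intermediate swap just below, the transpose_and_output8x8 calls above complete the classic separable layout: pass 0 transforms columns of the source and writes 8x8-transposed tiles into the aligned intermediate buffer, so pass 1 can run the identical 1-D code over what are now rows and emit the final tran_low_t coefficients. In outline (illustrative only, with fdct16_cols standing in for the loop body above):

    DECLARE_ALIGNED(16, int16_t, intermediate[256]);
    fdct16_cols(input, /*out0=*/intermediate, stride);  /* pass 0: transposed tiles out */
    fdct16_cols(intermediate, /*out1=*/output, 16);     /* pass 1: rows of pass 0       */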
- in = intermediate; - } -} - #undef ADD_EPI16 #undef SUB_EPI16 diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c index 657dcfa22..2d8f8f71e 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c @@ -11,40 +11,12 @@ #include <emmintrin.h> // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" -void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0, in1; - __m128i tmp; - const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64( - in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); - in0 = _mm_unpacklo_epi64( - in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); - - tmp = _mm_add_epi16(in0, in1); - in0 = _mm_unpacklo_epi16(zero, tmp); - in1 = _mm_unpackhi_epi16(zero, tmp); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(tmp, zero); - in1 = _mm_unpackhi_epi32(tmp, zero); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(tmp, 8); - - in1 = _mm_add_epi32(tmp, in0); - in0 = _mm_slli_epi32(in1, 1); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); -} - void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); @@ -86,47 +58,12 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { } #define DCT_HIGH_BIT_DEPTH 0 -#define FDCT4x4_2D aom_fdct4x4_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 -#define FDCT16x16_2D aom_fdct16x16_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" -#undef FDCT4x4_2D #undef FDCT8x8_2D -#undef FDCT16x16_2D -#define FDCT32x32_2D aom_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D aom_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION #undef DCT_HIGH_BIT_DEPTH - -#if CONFIG_HIGHBITDEPTH #define DCT_HIGH_BIT_DEPTH 1 -#define FDCT4x4_2D aom_highbd_fdct4x4_sse2 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2 -#define FDCT16x16_2D aom_highbd_fdct16x16_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT4x4_2D #undef FDCT8x8_2D -#undef FDCT16x16_2D - -#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D aom_highbd_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 58e8971dd..12ccf7f26 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -12,15 +12,10 @@ #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ #define 
AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#include "aom_dsp/x86/txfm_common_intrin.h" - #ifdef __cplusplus extern "C" { #endif -#define pair_set_epi32(a, b) \ - _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) - static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; buf0 = _mm_mul_epu32(a, b); @@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32( return res0 + res1; } -static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *zero) { - __m128i minus_one = _mm_set1_epi32(-1); - // Check for overflows - __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); - __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); - __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); - __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); - __m128i reg0_top_dwords = - _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg1_top_dwords = - _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg2_top_dwords = - _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg3_top_dwords = - _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); - __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); - __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); - __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); - __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); - __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); - int overflow_01 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); - int overflow_23 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); - return (overflow_01 + overflow_23); -} - -static INLINE int k_check_epi32_overflow_8( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_16( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - } - } - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_32( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i 
*preg15, const __m128i *preg16, const __m128i *preg17, - const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, - const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, - const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, - const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, - const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, - preg27, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, - preg31, zero); - } - } - } - } - } - } - } - return overflow; -} - static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { if (sizeof(tran_low_t) == 4) { const __m128i zero = _mm_setzero_si128(); @@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } } -static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, - const __m128i *pmultiplier, - const __m128i *prounding, int shift) { - const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); - const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); - const __m128i v0 = _mm_add_epi32(u0, *prounding); - const __m128i v1 = _mm_add_epi32(u1, *prounding); - const __m128i w0 = _mm_srai_epi32(v0, shift); - const __m128i w1 = _mm_srai_epi32(v1, shift); - return _mm_packs_epi32(w0, w1); -} - -static INLINE void transpose_and_output8x8( - const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, - const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, - const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr, - tran_low_t *out1_ptr) { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); - const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); - const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); - const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); - const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); - const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); - const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); - const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 
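  /* The transpose proceeds in three interleave stages: the tr0_* epi16
     unpacks above pair adjacent rows, these tr1_* epi32 unpacks merge
     2x2 blocks of words, and the tr2_* epi64 unpacks below assemble
     complete transposed rows. */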
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (pass == 0) { - _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); - } else { - storeu_output(&tr2_0, (out1_ptr + 0 * 16)); - storeu_output(&tr2_1, (out1_ptr + 1 * 16)); - storeu_output(&tr2_2, (out1_ptr + 2 * 16)); - storeu_output(&tr2_3, (out1_ptr + 3 * 16)); - storeu_output(&tr2_4, (out1_ptr + 4 * 16)); - storeu_output(&tr2_5, (out1_ptr + 5 * 16)); - storeu_output(&tr2_6, (out1_ptr + 6 * 16)); - storeu_output(&tr2_7, (out1_ptr + 7 * 16)); - } -} - -void fdct32_8col(__m128i *in0, __m128i *in1); - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 8fa1c04d0..c1fb259a1 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -13,10 +13,6 @@ %include "third_party/x86inc/x86inc.asm" -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
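The rodata below stores the DCT cosines in Q14 fixed point: each TRANSFORM_COEFFS c1, c2 line expands into pw_<c1>_<c2> and pw_<c2>_m<c1> word pairs for pmaddwd butterflies, while pw_11585x2 (23170 == 2 * 11585) feeds pmulhrsw, which computes (x * y + 2^14) >> 15 and therefore multiplies by cos(pi/4) with rounding. A small C model of the butterfly stage that the rewritten fdct8x8 below inlines (names illustrative; pd_8192 supplies the 1 << 13 rounding term):

#include <stdint.h>

/* One planar rotation: c1 = round(cos(t) * 2^14), c2 = round(sin(t) * 2^14),
   e.g. 15137 and 6270 for t = pi/8. */
static inline void butterfly_q14(int16_t x, int16_t y, int32_t c1, int32_t c2,
                                 int16_t *out0, int16_t *out1) {
  const int32_t rounding = 8192; /* pd_8192 */
  *out0 = (int16_t)((x * c1 + y * c2 + rounding) >> 14); /* pw_c1_c2 lanes  */
  *out1 = (int16_t)((x * c2 - y * c1 + rounding) >> 14); /* pw_c2_mc1 lanes */
}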
- SECTION_RODATA pw_11585x2: times 8 dw 23170 @@ -32,106 +28,7 @@ TRANSFORM_COEFFS 15137, 6270 TRANSFORM_COEFFS 16069, 3196 TRANSFORM_COEFFS 9102, 13623 -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -; 1D forward 8x8 DCT transform -%macro FDCT8_1D 1 - SUM_SUB 0, 7, 9 - SUM_SUB 1, 6, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 3, 4, 9 - - SUM_SUB 0, 3, 9 - SUM_SUB 1, 2, 9 - SUM_SUB 6, 5, 9 -%if %1 == 0 - SUM_SUB 0, 1, 9 -%endif - - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 - - pmulhrsw m6, m12 - pmulhrsw m5, m12 -%if %1 == 0 - pmulhrsw m0, m12 - pmulhrsw m1, m12 -%else - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 - SWAP 0, 1 -%endif - - SUM_SUB 4, 5, 9 - SUM_SUB 7, 6, 9 - BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 - SWAP 1, 4 - SWAP 3, 6 -%endmacro - -%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 - psraw m%3, m%1, 15 - psraw m%4, m%2, 15 - psubw m%1, m%3 - psubw m%2, m%4 - psraw m%1, 1 - psraw m%2, 1 -%endmacro - %macro STORE_OUTPUT 2 ; index, result -%if CONFIG_HIGHBITDEPTH ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); @@ -144,16 +41,16 @@ SECTION .text punpckhwd m12, m11 mova [outputq + 4*%1 + 0], m%2 mova [outputq + 4*%1 + 16], m12 -%else - mova [outputq + 2*%1], m%2 -%endif %endmacro +SECTION .text + +%if ARCH_X86_64 INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -180,25 +77,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D 0 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - FDCT8_1D 1 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - DIVIDE_ROUND_2X 0, 1, 9, 10 - DIVIDE_ROUND_2X 2, 3, 9, 10 - DIVIDE_ROUND_2X 4, 5, 9, 10 - DIVIDE_ROUND_2X 6, 7, 9, 10 - - STORE_OUTPUT 0, 0 - STORE_OUTPUT 8, 1 - STORE_OUTPUT 16, 2 - STORE_OUTPUT 24, 3 - STORE_OUTPUT 32, 4 - STORE_OUTPUT 40, 5 - STORE_OUTPUT 48, 6 - STORE_OUTPUT 56, 7 + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + 
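  ; The paddw/psubw pairs in these stages inline the removed SUM_SUB macro:
  ; each pair leaves the sum of the two inputs in one register and their
  ; difference in the other.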
psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, 
[GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 RET %endif diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm index 60446b086..99f17ebdf 100644 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm @@ -13,6 +13,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, ; int ref_stride, ; unsigned char *src, diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c index a99c0b40e..2a018c1cf 100644 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c @@ -11,8 +11,9 @@ #include <assert.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index 133640eb7..e5e3238d5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -11,8 +11,11 @@ #include <immintrin.h> #include <string.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" // ----------------------------------------------------------------------------- // Copy and average @@ -100,103 +103,258 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, } } -void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int width, int h, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - 
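Per output pixel, the new single-reference convolve paths below reduce to an 8-tap FIR plus one round, shift and clip. A scalar reference for the vertical case, assuming FILTER_BITS == 7 and an 8-tap filter (so fo_vert = taps / 2 - 1 = 3):

#include <stdint.h>

/* Illustrative scalar model of av1_highbd_convolve_y_sr at (x, y). */
static inline uint16_t convolve_y_pixel(const uint16_t *src, int src_stride,
                                        int x, int y, const int16_t *filter,
                                        int bd) {
  const int bits = 7;                     /* FILTER_BITS           */
  const int32_t round = (1 << bits) >> 1; /* round_const_bits      */
  const int32_t max = (1 << bd) - 1;      /* clip_pixel: 255..4095 */
  int32_t sum = round;
  for (int k = 0; k < 8; ++k)
    sum += filter[k] * src[(y - 3 + k) * src_stride + x];
  sum >>= bits;
  return (uint16_t)(sum < 0 ? 0 : (sum > max ? max : sum));
}

The AVX2 version below amortizes the loads by keeping a sliding window of unpacked row pairs (s[0]..s[7]), so each additional pair of output rows costs only two fresh source rows.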
(void)filter_x_stride; - (void)filter_y_stride; - (void)bd; +void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + 
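          /* w - j > 4: also filter the high half (s + 4) and store two
             full 8-pixel rows; the narrower branches below store 64-bit
             and 32-bit rows (4 and 2 pixels) instead. */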
const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} - assert(width % 4 == 0); - if (width > 32) { // width = 64 - __m256i p0, p1, p2, p3, u0, u1, u2, u3; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); - p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); - src += src_stride; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); - u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); - u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); - _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); - _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 16) { // width = 32 - __m256i p0, p1, u0, u1; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - src += src_stride; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 8) { // width = 16 - __m256i p0, p1, u0, u1; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); - src += src_stride << 1; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); - - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + dst_stride), - _mm256_avg_epu16(p1, u1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); - } else if (width > 4) { // width = 8 - __m128i p0, p1, u0, u1; - do { - p0 = _mm_loadu_si128((const __m128i *)src); - p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); - src += 
src_stride << 1; - u0 = _mm_loadu_si128((const __m128i *)dst); - u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); - - _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); - _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); - } else { // width = 4 - __m128i p0, p1, u0, u1; - do { - p0 = _mm_loadl_epi64((const __m128i *)src); - p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - src += src_stride << 1; - u0 = _mm_loadl_epi64((const __m128i *)dst); - u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); - - _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); - _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); +void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } } } +#define CONV8_ROUNDING_BITS (7) + // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -817,250 +975,6 @@ static void aom_highbd_filter_block1d8_v2_avx2( } while (height > 0); } -// Calculation with averaging the input pixels - -static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, - uint16_t *dst) { - const __m128i a0 = _mm256_castsi256_si128(*y0); - const __m128i a1 = _mm256_extractf128_si256(*y0, 1); - __m128i res = _mm_packus_epi32(a0, a1); - const __m128i pix = _mm_loadu_si128((const __m128i *)dst); - res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); - res = _mm_avg_epu16(res, pix); - _mm_storeu_si128((__m128i *)dst, res); -} - -static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - const __m128i pix0 = _mm_loadu_si128((const __m128i 
*)dst); - const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); - const __m256i pix = - _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); - a = _mm256_min_epi16(a, *mask); - a = _mm256_avg_epu16(a, pix); - _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); - _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); -} - -static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); - a = _mm256_min_epi16(a, *mask); - a = _mm256_avg_epu16(a, pix); - _mm256_storeu_si256((__m256i *)dst, a); -} - -static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); - const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); - __m256i p = _mm256_min_epi16(*y0, *mask); - p = _mm256_avg_epu16(p, pix0); - _mm256_storeu_si256((__m256i *)dst, p); - - p = _mm256_min_epi16(*y1, *mask); - p = _mm256_avg_epu16(p, pix1); - _mm256_storeu_si256((__m256i *)(dst + pitch), p); -} - -static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, - const __m128i *y1, - const __m128i *mask, - uint16_t *dst) { - __m128i res = _mm_packus_epi32(*y0, *y1); - const __m128i pix = _mm_loadu_si128((const __m128i *)dst); - res = _mm_min_epi16(res, *mask); - res = _mm_avg_epu16(res, pix); - _mm_storeu_si128((__m128i *)dst, res); -} - -static void aom_highbd_filter_block1d8_h8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_8x2_pixels(src_ptr, src_pitch, signal); - filter_8x1_pixels(signal, ff, &res0); - filter_8x1_pixels(&signal[4], ff, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - store_8x1_avg_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_16x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - filter_8x1_pixels(&signal[4], ff, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_v8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[9], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_8x9_init(src_ptr, src_pitch, signal); - - do { - pack_8x9_pixels(src_ptr, src_pitch, signal); - - filter_8x9_pixels(signal, ff, &res0, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, 
dst_pitch); - update_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -static void aom_highbd_filter_block1d16_v8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[17], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_16x9_init(src_ptr, src_pitch, signal); - - do { - pack_16x9_pixels(src_ptr, src_pitch, signal); - filter_16x9_pixels(signal, ff, &res0, &res1); - store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - update_16x9_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_h2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_8x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_2t_pixels(src_ptr, signal); - filter_8x1_2t_pixels(signal, &ff, &res0); - store_8x1_avg_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_16x1_2t_pixels(src_ptr, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -static void aom_highbd_filter_block1d16_v2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[3], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - __m256i ff; - - pack_2t_filter(filter, &ff); - pack_16x2_init(src_ptr, signal); - - do { - pack_16x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16x2_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_v2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m128i signal[3], res0, res1; - const __m128i max = _mm_set1_epi16((1 << bd) - 1); - __m128i ff; - - pack_8x1_2t_filter(filter, &ff); - pack_8x2_init(src_ptr, signal); - - do { - pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); - filter_8_2t_pixels(signal, &ff, &res0, &res1); - store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const 
int16_t *, int); @@ -1080,32 +994,5 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); -HIGH_FUN_CONV_2D(, avx2); - -void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -#define aom_highbd_filter_block1d4_h8_avg_avx2 \ - aom_highbd_filter_block1d4_h8_avg_sse2 -#define aom_highbd_filter_block1d4_h2_avg_avx2 \ - aom_highbd_filter_block1d4_h2_avg_sse2 -#define aom_highbd_filter_block1d4_v8_avg_avx2 \ - aom_highbd_filter_block1d4_v8_avg_sse2 -#define aom_highbd_filter_block1d4_v2_avg_avx2 \ - aom_highbd_filter_block1d4_v2_avg_sse2 - -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - avx2); -HIGH_FUN_CONV_2D(avg_, avx2); #undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c new file mode 100644 index 000000000..f7ac9b496 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m128i s[16], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = 
_mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m128i s[4], coeffs_x[4]; + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c deleted file mode 100644 index e001a1d70..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c +++ /dev/null @@ 
-1,240 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "aom_ports/msvc.h" -#include "./aom_dsp_rtcd.h" - -// ----------------------------------------------------------------------------- -// D45E_PRED -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y, - const __m256i *z) { - const __m256i one = _mm256_set1_epi16(1); - const __m256i a = _mm256_avg_epu16(*x, *z); - const __m256i b = - _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one)); - return _mm256_avg_epu16(b, *y); -} - -static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1, - const __m256i *a2, uint16_t **dst, - ptrdiff_t stride) { - const __m256i y = avg3_epu16(a0, a1, a2); - _mm256_storeu_si256((__m256i *)*dst, y); - *dst += stride; -} - -void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 9); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 9)); - x0 = _mm256_insert_epi16(x0, above[23], 15); - const __m256i y = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 15); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); - x2 = _mm256_insert_epi16(x2, above[31], 15); - const __m256i y = 
avg3_epu16(&x0, &x1, &x2); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 33); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); - x0 = _mm256_insert_epi16(x0, above[47], 15); - const __m256i y = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); - __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); - - uint16_t *dst1 = dst; - uint16_t *dst2 = dst + 16; - - d45e_w16(&x0, &x1, &x2, &dst1, stride); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x0, &x1, &x2, &dst1, stride); - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - } while (i < 15); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i u = avg3_epu16(&x0, &x1, &x2); - _mm256_storeu_si256((__m256i *)dst1, u); - - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17)); - y2 = _mm256_insert_epi16(y2, above[47], 15); - u = avg3_epu16(&y0, &y1, &y2); - _mm256_storeu_si256((__m256i *)dst2, u); -} - -void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); - __m256i y1 = 
_mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); - - uint16_t *dst1 = dst; - uint16_t *dst2 = dst + 16; - - d45e_w16(&x0, &x1, &x2, &dst1, stride); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x0, &x1, &x2, &dst1, stride); - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - } while (i < 33); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); - __m256i u = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst1, u); - - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33)); - y0 = _mm256_insert_epi16(y0, above[63], 15); - u = avg3_epu16(&y1, &y2, &y0); - _mm256_storeu_si256((__m256i *)dst2, u); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c index 691e166cf..5a55736c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -11,7 +11,7 @@ #include <emmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- // H_PRED @@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, dst += stride; } } - -// ----------------------------------------------------------------------------- -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, - const __m128i *z) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a = _mm_avg_epu16(*x, *z); - const __m128i b = - _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); - return _mm_avg_epu16(b, *y); -} - -void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); - const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); - const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); - const __m128i row0 = _mm_srli_si128(avg2, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg2, 4); - const __m128i row3 = _mm_srli_si128(avg3, 2); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += 
stride; - _mm_storel_epi64((__m128i *)dst, row3); - - dst -= stride; - dst[0] = _mm_extract_epi16(avg3, 1); - dst[stride] = _mm_extract_epi16(avg3, 0); -} - -void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); - const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); - const __m128i row0 = _mm_srli_si128(avg3, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg3, 2); - const __m128i row3 = avg3; - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); - const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); - const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); - const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); - const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); - const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); - const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); - const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); - const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); - const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); - const __m128i row2 = _mm_srli_si128(row3, 4); - const __m128i row1 = _mm_srli_si128(row3, 8); - const __m128i row0 = _mm_srli_si128(avg3, 4); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst[0] = _mm_extract_epi16(avg2, 3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); - const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); - __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); - CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); - const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); - (void)left; - (void)bd; - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); -} - -void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i h76543210 = _mm_load_si128((const __m128i *)above); - __m128i hx7654321 = _mm_srli_si128(h76543210, 2); - __m128i h87654321 = 
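The 4x4 oblique predictors above assemble their neighbor window with one unaligned load from above - 4 followed by _mm_insert_epi16 patches that splice left-column pixels into the otherwise unused low lanes; every shifted variant then falls out of _mm_srli_si128. A minimal SSE2 demo of the splice (hypothetical values):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Memory image of above[-4..3]; the first three lanes are don't-cares
     the inserts overwrite, lane 3 is the top-left corner pixel X. */
  uint16_t mem[8] = { 0, 0, 0, /*X*/ 100, /*A*/ 1, /*B*/ 2, /*C*/ 3, /*D*/ 4 };
  const uint16_t I = 10, J = 11, K = 12; /* left-column pixels */
  __m128i v = _mm_loadu_si128((const __m128i *)mem); /* ? ? ? X A B C D */
  v = _mm_insert_epi16(v, K, 0);
  v = _mm_insert_epi16(v, J, 1);
  v = _mm_insert_epi16(v, I, 2); /* now K J I X A B C D */
  uint16_t out[8];
  _mm_storeu_si128((__m128i *)out, v);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]); /* 12 11 10 100 1 2 3 4 */
  printf("\n");
  return 0;
}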
_mm_insert_epi16(hx7654321, above[8], 7); - __m128i hx8765432 = _mm_srli_si128(h87654321, 2); - __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); - __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); - dst += stride; - - // hcba98765 - h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); - h76543210 = _mm_insert_epi16(h76543210, above[11], 7); - // hxcba9876 - hx7654321 = _mm_srli_si128(h76543210, 2); - // hxxcba987 - hx8765432 = _mm_srli_si128(h76543210, 4); - avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); -} - -void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - __m128i y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x0 = _mm_loadu_si128((const __m128i *)(above + 3)); - y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 4)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 5)); - x2 = _mm_insert_epi16(x2, above[11], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} - -static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, - const __m128i *a2, uint16_t **dst, - ptrdiff_t stride) { - const __m128i y = avg3_epu16(a0, a1, a2); - _mm_storeu_si128((__m128i *)*dst, y); - *dst += stride; -} - -void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 9); - - x0 = _mm_loadu_si128((const __m128i *)(above + 9)); - x0 = _mm_insert_epi16(x0, above[15], 7); - const __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); -} - -void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = 
_mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 15); - - x0 = _mm_loadu_si128((const __m128i *)(above + 15)); - __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 16)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 17)); - x2 = _mm_insert_epi16(x2, above[23], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c deleted file mode 100644 index b089a3f43..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> - -#include "./aom_dsp_rtcd.h" - -// ----------------------------------------------------------------------------- -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, - const __m128i *z) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a = _mm_avg_epu16(*x, *z); - const __m128i b = - _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); - return _mm_avg_epu16(b, *y); -} - -DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 -}; - -static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { - *a = _mm_shuffle_epi8(*a, *rotrw); - return *a; -} - -void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i IXABCDEF = - _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); - const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); - const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); - __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); - __m128i rowa = avg2; - __m128i rowb = avg3; - int i; - (void)bd; - for (i = 0; i < 8; i += 2) { - _mm_store_si128((__m128i *)dst, rowa); - dst += stride; - 
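The rotate_right_epu16 table turns _mm_shuffle_epi8 into a one-lane rotate of eight 16-bit values: lane i picks up lane i + 1 and lane 7 wraps around to lane 0, which is how the SSSE3 predictors pop one left-column average per row out of avg3_left. SSSE3 demo (sketch):

#include <tmmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t tbl[16] = { 2,  3,  4,  5,  6,  7,  8, 9,
                            10, 11, 12, 13, 14, 15, 0, 1 };
  uint16_t in[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }, out[8];
  __m128i v = _mm_loadu_si128((const __m128i *)in);
  __m128i r = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)tbl));
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]); /* 1 2 3 4 5 6 7 0 */
  printf("\n");
  return 0;
}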
_mm_store_si128((__m128i *)dst, rowb); - dst += stride; - rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); - rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); - } -} - -void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A0 = _mm_load_si128((const __m128i *)above); - const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i avg2_0 = _mm_avg_epu16(A0, B0); - const __m128i avg2_1 = _mm_avg_epu16(A1, B1); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); - const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); - const __m128i L1_ = _mm_srli_si128(L1, 2); - __m128i rowa_0 = avg2_0; - __m128i rowa_1 = avg2_1; - __m128i rowb_0 = avg3_0; - __m128i rowb_1 = avg3_1; - __m128i avg3_left[2]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); - avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); - for (i = 0; i < 2; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; j += 2) { - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - dst += stride; - _mm_store_si128((__m128i *)dst, rowb_0); - _mm_store_si128((__m128i *)(dst + 8), rowb_1); - dst += stride; - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); - rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); - } - } -} - -void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_load_si128((const __m128i *)above); - const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); - const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i avg2_0 = _mm_avg_epu16(A0, B0); - const __m128i avg2_1 = _mm_avg_epu16(A1, B1); - const __m128i avg2_2 = _mm_avg_epu16(A2, B2); - const __m128i avg2_3 = _mm_avg_epu16(A3, B3); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); - const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); - const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); - const __m128i C3 = 
_mm_alignr_epi8(B3, B2, 14); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); - const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); - const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); - const __m128i L3_ = _mm_srli_si128(L3, 2); - __m128i rowa_0 = avg2_0; - __m128i rowa_1 = avg2_1; - __m128i rowa_2 = avg2_2; - __m128i rowa_3 = avg2_3; - __m128i rowb_0 = avg3_0; - __m128i rowb_1 = avg3_1; - __m128i rowb_2 = avg3_2; - __m128i rowb_3 = avg3_3; - __m128i avg3_left[4]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); - avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); - avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); - avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); - for (i = 0; i < 4; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; j += 2) { - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - _mm_store_si128((__m128i *)(dst + 16), rowa_2); - _mm_store_si128((__m128i *)(dst + 24), rowa_3); - dst += stride; - _mm_store_si128((__m128i *)dst, rowb_0); - _mm_store_si128((__m128i *)(dst + 8), rowb_1); - _mm_store_si128((__m128i *)(dst + 16), rowb_2); - _mm_store_si128((__m128i *)(dst + 24), rowb_3); - dst += stride; - rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); - rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); - rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); - rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); - rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); - } - } -} - -void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); - const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i AXIJKLMN = - _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); - const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); - __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); - __m128i rowa = avg3; - int i; - (void)bd; - for (i = 0; i < 8; ++i) { - rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa); - dst += stride; - } -} - -void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i B0 = _mm_load_si128((const __m128i *)above); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i L0 = 
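The row loops in these D117/D135 kernels are driven by _mm_alignr_epi8(a, b, 14), which extracts bytes 14..29 of the concatenation b|a: the result's lane 0 is lane 7 of b, and lanes 1..7 are lanes 0..6 of a. Each stored row is therefore the previous row shifted right by one pixel with a fresh left-column value injected into column 0. Demo (sketch):

#include <tmmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint16_t row[8]  = { 10, 11, 12, 13, 14, 15, 16, 17 };
  uint16_t left[8] = { 0, 0, 0, 0, 0, 0, 0, 99 }; /* next left pixel in lane 7 */
  __m128i a = _mm_loadu_si128((const __m128i *)row);
  __m128i b = _mm_loadu_si128((const __m128i *)left);
  uint16_t out[8];
  _mm_storeu_si128((__m128i *)out, _mm_alignr_epi8(a, b, 14));
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]); /* 99 10 11 12 13 14 15 16 */
  printf("\n");
  return 0;
}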
_mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); - const __m128i C1 = _mm_srli_si128(B1, 2); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); - const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); - __m128i rowa_0 = avg3_0; - __m128i rowa_1 = avg3_1; - __m128i avg3_left[2]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); - avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); - for (i = 0; i < 2; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; ++j) { - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - dst += stride; - } - } -} - -void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i B0 = _mm_load_si128((const __m128i *)above); - const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); - const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); - const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); - const __m128i C3 = _mm_srli_si128(B3, 2); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); - const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); - const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); - const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); - __m128i rowa_0 = avg3_0; - __m128i rowa_1 = avg3_1; - __m128i rowa_2 = avg3_2; - __m128i rowa_3 = avg3_3; - __m128i avg3_left[4]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); - avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); - avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); - avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); - for (i = 0; i < 4; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; ++j) { - rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); - rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, 
rotr_epu16(&avg_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - _mm_store_si128((__m128i *)(dst + 16), rowa_2); - _mm_store_si128((__m128i *)(dst + 24), rowa_3); - dst += stride; - } - } -} - -void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); - const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); - const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i AXIJKLMN = - _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); - const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); - const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); - const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); - const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); - const __m128i row0 = - _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); - const __m128i row1 = - _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); - const __m128i row2 = - _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); - const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); - const __m128i row4 = - _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); - const __m128i row5 = - _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); - const __m128i row6 = - _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); - const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); - (void)bd; - _mm_store_si128((__m128i *)dst, row0); - dst += stride; - _mm_store_si128((__m128i *)dst, row1); - dst += stride; - _mm_store_si128((__m128i *)dst, row2); - dst += stride; - _mm_store_si128((__m128i *)dst, row3); - dst += stride; - _mm_store_si128((__m128i *)dst, row4); - dst += stride; - _mm_store_si128((__m128i *)dst, row5); - dst += stride; - _mm_store_si128((__m128i *)dst, row6); - dst += stride; - _mm_store_si128((__m128i *)dst, row7); -} - -void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); - const __m128i B1 = _mm_srli_si128(A1, 2); - const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); - const __m128i C1 = _mm_srli_si128(A1, 4); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); - const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); - const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); - const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); - const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); - __m128i row_0 = avg3_0; - __m128i row_1 = avg3_1; - __m128i avg2_avg3_left[2][2]; - int i, j; - (void)bd; - - 
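D153 advances two pixels per row along the left edge, alternating two-tap and three-tap averages of the left column; _mm_unpacklo_epi16 / _mm_unpackhi_epi16 pre-interleave the two average vectors so that each _mm_alignr_epi8(..., 12) step can shift a ready-made (avg2, avg3) pair into the row. Demo of the interleave (sketch, arbitrary values):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint16_t a2[8] = { 20, 21, 22, 23, 24, 25, 26, 27 }; /* avg2 of left */
  uint16_t a3[8] = { 30, 31, 32, 33, 34, 35, 36, 37 }; /* avg3 of left */
  __m128i lo = _mm_unpacklo_epi16(_mm_loadu_si128((const __m128i *)a2),
                                  _mm_loadu_si128((const __m128i *)a3));
  uint16_t out[8];
  _mm_storeu_si128((__m128i *)out, lo);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]); /* 20 30 21 31 22 32 23 33 */
  printf("\n");
  return 0;
}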
avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); - - for (j = 0; j < 2; ++j) { - for (i = 0; i < 2; ++i) { - const __m128i avg2_avg3 = avg2_avg3_left[j][i]; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - } - } -} - -void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); - const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); - const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); - const __m128i B3 = _mm_srli_si128(A3, 2); - const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); - const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); - const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); - const __m128i C3 = _mm_srli_si128(A3, 4); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); - const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); - const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); - const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); - const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); - const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); - const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); - const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); - const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); - const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); - const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); - const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); - __m128i row_0 = avg3_0; - __m128i row_1 = avg3_1; - __m128i row_2 = avg3_2; - __m128i row_3 = avg3_3; - __m128i 
avg2_avg3_left[4][2]; - int i, j; - (void)bd; - - avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); - avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); - avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); - avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); - - for (j = 0; j < 4; ++j) { - for (i = 0; i < 2; ++i) { - const __m128i avg2_avg3 = avg2_avg3_left[j][i]; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - } - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c index 94c68885c..c954da94e 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -11,210 +11,26 @@ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/common_avx2.h" #include "aom_dsp/x86/lpf_common_sse2.h" #include "aom/aom_integer.h" -#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 -static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, - const uint8_t *t, int bd, __m256i *blt, - __m256i *lt, __m256i *thr) { - const int shift = bd - 8; - const __m128i zero = _mm_setzero_si128(); - - __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); - __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *blt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); - y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *lt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); - y = 
_mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *thr = _mm256_slli_epi16(y, shift); -} - -static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, - __m256i *p, __m256i *q) { - int i; - for (i = 0; i < size; i++) { - p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); - q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); - } -} - -static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, - const __m256i *t, __m256i *hev) { - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); - __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); - h = _mm256_subs_epu16(h, *t); - - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - const __m256i zero = _mm256_setzero_si256(); - *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); -} - -static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, - const __m256i *l, const __m256i *bl, - __m256i *mask) { - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); - max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); - max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); - - int i; - for (i = 1; i < 4; ++i) { - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); - } - max = _mm256_subs_epu16(max, *l); - *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask -} - -static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, - const __m256i *q, int bd, int start, - int end, __m256i *flat) { - __m256i max = _mm256_setzero_si256(); - int i; - for (i = start; i < end; ++i) { - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); - } - - __m256i ft; - if (bd == 8) - ft = _mm256_subs_epu16(max, *th); - else if (bd == 10) - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); - else // bd == 12 - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); - - const __m256i zero = _mm256_setzero_si256(); - *flat = _mm256_cmpeq_epi16(ft, zero); -} - -// Note: -// Access p[3-1], p[0], and q[3-1], q[0] -static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - // check the distance 1,2,3 against 0 - flat_mask_internal(th, p, q, bd, 1, 4, flat); -} - -// Note: -// access p[7-4], p[0], and q[7-4], q[0] -static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - flat_mask_internal(th, p, q, bd, 4, 8, flat); -} - -static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, - __m256i *pixel) { - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(*pixel, *max); - clamped = _mm256_andnot_si256(mask, *pixel); - mask = _mm256_and_si256(mask, *max); - clamped = _mm256_or_si256(mask, clamped); - - mask = _mm256_cmpgt_epi16(clamped, *min); - clamped = _mm256_and_si256(mask, clamped); - mask = _mm256_andnot_si256(mask, *min); - *pixel = 
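get_limit and flat_mask_internal account for bit depth the same way: the blimit/limit/thresh arrays are specified at 8-bit scale, so they are widened to 16 bits and shifted left by bd - 8 (x4 for 10-bit, x16 for 12-bit) before being compared against high-bit-depth pixel differences. Scalar equivalent (sketch):

#include <stdint.h>

/* bd is 8, 10 or 12 in these high-bit-depth paths. */
static uint16_t scale_limit(uint8_t limit8, int bd) {
  return (uint16_t)(limit8 << (bd - 8));
}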
_mm256_or_si256(clamped, mask); -} - -static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, - const __m256i *th, int bd, __m256i *ps, - __m256i *qs) { - __m256i t80; - if (bd == 8) - t80 = _mm256_set1_epi16(0x80); - else if (bd == 10) - t80 = _mm256_set1_epi16(0x200); - else // bd == 12 - t80 = _mm256_set1_epi16(0x800); - - __m256i ps0 = _mm256_subs_epi16(p[0], t80); - __m256i ps1 = _mm256_subs_epi16(p[1], t80); - __m256i qs0 = _mm256_subs_epi16(q[0], t80); - __m256i qs1 = _mm256_subs_epi16(q[1], t80); - - const __m256i one = _mm256_set1_epi16(1); - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i zero = _mm256_setzero_si256(); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filter = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filter); - - __m256i hev; - highbd_hev_mask(p, q, th, &hev); - filter = _mm256_and_si256(filter, hev); - - const __m256i x = _mm256_subs_epi16(qs0, ps0); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - pixel_clamp(&pmin, &pmax, &filter); - filter = _mm256_and_si256(filter, *mask); - - const __m256i t3 = _mm256_set1_epi16(3); - const __m256i t4 = _mm256_set1_epi16(4); - - __m256i filter1 = _mm256_adds_epi16(filter, t4); - __m256i filter2 = _mm256_adds_epi16(filter, t3); - pixel_clamp(&pmin, &pmax, &filter1); - pixel_clamp(&pmin, &pmax, &filter2); - filter1 = _mm256_srai_epi16(filter1, 3); - filter2 = _mm256_srai_epi16(filter2, 3); - - qs0 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &qs0); - ps0 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &ps0); - - qs[0] = _mm256_adds_epi16(qs0, t80); - ps[0] = _mm256_adds_epi16(ps0, t80); - - filter = _mm256_adds_epi16(filter1, one); - filter = _mm256_srai_epi16(filter, 1); - filter = _mm256_andnot_si256(hev, filter); - - qs1 = _mm256_subs_epi16(qs1, filter); - pixel_clamp(&pmin, &pmax, &qs1); - ps1 = _mm256_adds_epi16(ps1, filter); - pixel_clamp(&pmin, &pmax, &ps1); - - qs[1] = _mm256_adds_epi16(qs1, t80); - ps[1] = _mm256_adds_epi16(ps1, t80); -} -#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 - -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 -void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); } -void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { - aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); } void aom_highbd_lpf_horizontal_4_dual_avx2( @@ -248,626 +64,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2( aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, 
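highbd_filter4 runs the classic filter4 in a signed domain: subtracting t80 = 0x80 << (bd - 8) re-centres pixels from [0, 2^bd) onto [-2^(bd-1), 2^(bd-1)), after which the pmin/pmax clamps keep every intermediate representable and adding t80 back restores unsigned pixels. A scalar sketch of the re-centring for bd = 10:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int bd = 10;
  const int16_t t80 = (int16_t)(0x80 << (bd - 8)); /* 0x200 */
  for (uint16_t p = 0; p < (1 << bd); ++p) {
    const int16_t ps = (int16_t)(p - t80); /* in [-512, 511] */
    assert(ps >= -(1 << (bd - 1)) && ps < (1 << (bd - 1)));
    assert((uint16_t)(ps + t80) == p); /* round trip is lossless */
  }
  return 0;
}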
bd); } -#else -void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - __m256i blimit, limit, thresh; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - - __m256i p[8], q[8]; - load_highbd_pixel(s, 8, pitch, p, q); - - __m256i mask; - highbd_filter_mask(p, q, &limit, &blimit, &mask); - - __m256i flat, flat2; - const __m256i one = _mm256_set1_epi16(1); - highbd_flat_mask4(&one, p, q, &flat, bd); - highbd_flat_mask5(&one, p, q, &flat2, bd); - - flat = _mm256_and_si256(flat, mask); - flat2 = _mm256_and_si256(flat2, flat); - - __m256i ps[2], qs[2]; - highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - - // flat and wide flat calculations - __m256i flat_p[3], flat_q[3]; - __m256i flat2_p[7], flat2_q[7]; - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - - __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), - _mm256_add_epi16(p[4], p[3])); - __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), - _mm256_add_epi16(q[4], q[3])); - - __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); - sum_p = _mm256_add_epi16(sum_p, sum_lp); - - __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); - sum_q = _mm256_add_epi16(sum_q, sum_lq); - sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); - sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); - - flat2_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); - flat2_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); - flat_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); - flat_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); - - __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); - __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); - __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); - __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); - - sum_q = _mm256_sub_epi16(sum_p, p[6]); - sum_p = _mm256_sub_epi16(sum_p, q[6]); - flat2_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); - flat2_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); - - sum_lq = _mm256_sub_epi16(sum_lp, p[2]); - sum_lp = _mm256_sub_epi16(sum_lp, q[2]); - flat_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); - flat_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); - - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p3 = _mm256_add_epi16(sum_p3, p[3]); - sum_q3 = _mm256_add_epi16(sum_q3, q[3]); - - sum_p = _mm256_sub_epi16(sum_p, q[5]); - sum_q = _mm256_sub_epi16(sum_q, p[5]); - flat2_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); - flat2_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); - - sum_lp = _mm256_sub_epi16(sum_lp, q[1]); - sum_lq = _mm256_sub_epi16(sum_lq, p[1]); - flat_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); - flat_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); - - int i; - for (i = 3; i < 7; ++i) { - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); - sum_q = _mm256_sub_epi16(sum_q, p[7 - 
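The "flat and wide flat" block in the #else branch keeps two running sums and produces each successive flat2 output with one add and one subtract rather than re-summing 16 taps; the outermost samples p7/q7 are replicated as the window slides past the edge of the filtered segment. A scalar model of one wide output (sketch; line[0..15] holds p7..p0 q0..q7 across the edge):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Valid for j = 1..14, i.e. outputs p6..q6. */
static uint16_t wide_flat2_tap(const uint16_t *line, int j) {
  uint32_t sum = line[j]; /* the centre sample is counted twice */
  for (int t = -7; t <= 7; ++t) {
    int k = j + t;
    if (k < 0) k = 0;   /* replicate p7 off the left end */
    if (k > 15) k = 15; /* replicate q7 off the right end */
    sum += line[k];
  }
  return (uint16_t)ROUND_POWER_OF_TWO(sum, 4);
}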
i]); - flat2_p[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); - flat2_q[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); - } - } - - // highbd_filter8 - p[2] = _mm256_andnot_si256(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm256_and_si256(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm256_andnot_si256(flat, q[2]); - flat_q[2] = _mm256_and_si256(flat, flat_q[2]); - q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values - - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm256_andnot_si256(flat, ps[i]); - flat_p[i] = _mm256_and_si256(flat, flat_p[i]); - p[i] = _mm256_or_si256(ps[i], flat_p[i]); - qs[i] = _mm256_andnot_si256(flat, qs[i]); - flat_q[i] = _mm256_and_si256(flat, flat_q[i]); - q[i] = _mm256_or_si256(qs[i], flat_q[i]); - } - - // highbd_filter16 - - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm256_andnot_si256(flat2, p[i]); - flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm256_andnot_si256(flat2, q[i]); - flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); - q[i] = _mm256_or_si256(q[i], flat2_q[i]); - _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); - _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); - } -} - -static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, - uint16_t *dst, int dst_p) { - __m256i x[16]; - int i; - for (i = 0; i < 16; ++i) { - x[i] = _mm256_loadu_si256((const __m256i *)src); - src += src_p; - } - mm256_transpose_16x16(x, x); - for (i = 0; i < 16; ++i) { - _mm256_storeu_si256((__m256i *)dst, x[i]); - dst += dst_p; - } -} - -void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[256]); - - // Transpose 16x16 - highbd_transpose16x16(s - 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); - - // Transpose back - highbd_transpose16x16(t_dst, 16, s - 8, p); -} - -static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, - const uint8_t *t0, const uint8_t *b1, - const uint8_t *l1, const uint8_t *t1, int bd, - __m256i *blt, __m256i *lt, __m256i *thr) { - const __m128i z128 = _mm_setzero_si128(); - const __m128i blimit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); - const __m128i limit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); - const __m128i thresh0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); - const __m128i blimit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); - const __m128i limit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); - const __m128i thresh1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); - - *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); - *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); - *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); - - int shift = bd - 8; - *blt = _mm256_slli_epi16(*blt, shift); - *lt = _mm256_slli_epi16(*lt, shift); - *thr = _mm256_slli_epi16(*thr, shift); -} - -void aom_highbd_lpf_horizontal_4_dual_avx2( - 
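The vertical filters in this file never re-derive column logic: they transpose a 16x16 tile into a scratch buffer, run the horizontal kernel on it, and transpose back, trading two transposes for a single shared filter body. Scalar shape of the pattern (sketch; horizontal_filter stands in for whichever row kernel applies):

#include <stdint.h>

static void transpose16x16_c(const uint16_t *src, int src_stride,
                             uint16_t *dst, int dst_stride) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c)
      dst[c * dst_stride + r] = src[r * src_stride + c];
}

/* Filtering a vertical edge at *s with pitch p then looks like:
 *   uint16_t t[16 * 16];
 *   transpose16x16_c(s - 8, p, t, 16);
 *   horizontal_filter(t + 8 * 16, 16, ...);  // hypothetical kernel
 *   transpose16x16_c(t, 16, s - 8, p);
 */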
uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); - __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80, tff80, tffe0, t1f, t7f; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - tff80 = _mm256_set1_epi16(0xff80); - tffe0 = _mm256_set1_epi16(0xffe0); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); - } else if (bd == 10) { - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); - } else { // bd == 12 - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); - } - - __m256i ps1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); - __m256i ps0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); - __m256i qs0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80); - __m256i qs1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); - - // filter_mask and hev_mask - const __m256i zero = _mm256_setzero_si256(); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - __m256i work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), - _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), - 
_mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // filter4 - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - __m256i work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); - - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - __m256i filter1 = _mm256_adds_epi16(filt, t4); - pixel_clamp(&pmin, &pmax, &filter1); - __m256i filter2 = _mm256_adds_epi16(filt, t3); - pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 - filter1 = _mm256_srli_epi16(filter1, 3); - work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm256_and_si256(filter1, t1f); // clamp the range - filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits - - // Filter2 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter2); - filter2 = _mm256_srli_epi16(filter2, 3); - work_a = _mm256_and_si256(work_a, tffe0); - filter2 = _mm256_and_si256(filter2, t1f); - filter2 = _mm256_or_si256(filter2, work_a); - - // filt >> 1 - // equivalent to shifting 0x1f left by bitdepth - 8 - // and setting new bits to 1 - filt = _mm256_adds_epi16(filter1, one); - work_a = _mm256_cmpgt_epi16(zero, filt); - filt = _mm256_srli_epi16(filt, 1); - work_a = _mm256_and_si256(work_a, tff80); - filt = _mm256_and_si256(filt, t7f); - filt = _mm256_or_si256(filt, work_a); - - filt = _mm256_andnot_si256(hev, filt); - - filter1 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &filter1); - q0 = _mm256_adds_epi16(filter1, t80); - - filter1 = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &filter1); - q1 = _mm256_adds_epi16(filter1, t80); - - filter2 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &filter2); - p0 = _mm256_adds_epi16(filter2, t80); - - filter2 = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &filter2); - p1 = _mm256_adds_epi16(filter2, t80); - - _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); - _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); - _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); - _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); -} - -void aom_highbd_lpf_horizontal_8_dual_avx2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i 
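The "Filter1 >> 3" sequence emulates an arithmetic right shift with a logical one: shift zeros in, keep only the bits a clamped filter value can occupy (t1f), then OR the vacated sign bits back in (tffe0), with both masks pre-scaled to the bit depth. Scalar check for bd = 8 (sketch; assumes the compiler's >> on signed values is arithmetic, as on mainstream targets):

#include <assert.h>
#include <stdint.h>

static int16_t sra3_emulated(int16_t v) {
  uint16_t u = (uint16_t)v >> 3;             /* logical shift */
  uint16_t sign = v < 0 ? 0xffe0 : 0;        /* tffe0 for bd == 8 */
  return (int16_t)((u & 0x1f) | sign);       /* t1f for bd == 8 */
}

int main(void) {
  for (int v = -128; v <= 127; ++v) /* clamped filter range at bd == 8 */
    assert(sra3_emulated((int16_t)v) == (int16_t)(v >> 3));
  return 0;
}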
q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - } else if (bd == 10) { - t80 = _mm256_set1_epi16(0x200); - } else { // bd == 12 - t80 = _mm256_set1_epi16(0x800); - } - - __m256i ps1, ps0, qs0, qs1; - ps1 = _mm256_subs_epi16(p1, t80); - ps0 = _mm256_subs_epi16(p0, t80); - qs0 = _mm256_subs_epi16(q0, t80); - qs1 = _mm256_subs_epi16(q1, t80); - - // filter_mask and hev_mask - __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i zero = _mm256_set1_epi16(0); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - mask = _mm256_max_epi16(abs_q1q0, mask); - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // flat_mask4 - flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); - flat = _mm256_max_epi16(work, flat); - flat = _mm256_max_epi16(abs_p1p0, flat); - flat = _mm256_max_epi16(abs_q1q0, flat); - - if (bd == 8) - flat = _mm256_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); - - flat = _mm256_cmpeq_epi16(flat, zero); - flat = _mm256_and_si256(flat, mask); // flat & mask - - // Added before shift for rounding part of ROUND_POWER_OF_TWO - __m256i workp_a, workp_b, workp_shft; - workp_a = - _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); - const __m256i four = _mm256_set1_epi16(4); - workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); - - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); - workp_shft = 
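The workp_a/workp_b chain is an incremental evaluation of the 7-tap "flat" filter: the four is folded in once as the ROUND_POWER_OF_TWO rounding term, and each next output adjusts the running sums with one add and one subtract. The closed-form taps it computes, in scalar form (sketch; p[0..3] = p0..p3 nearest the edge first, q likewise):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static void flat_filter8(const uint16_t *p, const uint16_t *q,
                         uint16_t *op, uint16_t *oq) {
  op[2] = ROUND_POWER_OF_TWO(3 * p[3] + 2 * p[2] + p[1] + p[0] + q[0], 3);
  op[1] = ROUND_POWER_OF_TWO(2 * p[3] + p[2] + 2 * p[1] + p[0] + q[0] + q[1], 3);
  op[0] = ROUND_POWER_OF_TWO(p[3] + p[2] + p[1] + 2 * p[0] + q[0] + q[1] + q[2], 3);
  oq[0] = ROUND_POWER_OF_TWO(p[2] + p[1] + p[0] + 2 * q[0] + q[1] + q[2] + q[3], 3);
  oq[1] = ROUND_POWER_OF_TWO(p[1] + p[0] + q[0] + 2 * q[1] + q[2] + 2 * q[3], 3);
  oq[2] = ROUND_POWER_OF_TWO(p[0] + q[0] + q[1] + 2 * q[2] + 3 * q[3], 3);
}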
_mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); - - // lp filter - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt, filter1, filter2, work_a; - filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - filter1 = _mm256_adds_epi16(filt, t4); - filter2 = _mm256_adds_epi16(filt, t3); - - // Filter1 >> 3 - pixel_clamp(&pmin, &pmax, &filter1); - filter1 = _mm256_srai_epi16(filter1, 3); - - // Filter2 >> 3 - pixel_clamp(&pmin, &pmax, &filter2); - filter2 = _mm256_srai_epi16(filter2, 3); - - // filt >> 1 - filt = _mm256_adds_epi16(filter1, one); - filt = _mm256_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - filt = _mm256_andnot_si256(hev, filt); - - work_a = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q0 = _mm256_loadu_si256((__m256i *)flat_oq0); - work_a = _mm256_andnot_si256(flat, work_a); - q0 = _mm256_and_si256(flat, q0); - q0 = _mm256_or_si256(work_a, q0); - - work_a = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q1 = _mm256_loadu_si256((__m256i *)flat_oq1); - work_a = _mm256_andnot_si256(flat, work_a); - q1 = _mm256_and_si256(flat, q1); - q1 = _mm256_or_si256(work_a, q1); - - work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - q2 = _mm256_loadu_si256((__m256i *)flat_oq2); - work_a = _mm256_andnot_si256(flat, work_a); - q2 = _mm256_and_si256(flat, q2); - q2 = _mm256_or_si256(work_a, q2); - - work_a = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - p0 = _mm256_loadu_si256((__m256i *)flat_op0); - work_a = _mm256_andnot_si256(flat, work_a); - p0 = _mm256_and_si256(flat, p0); - p0 = _mm256_or_si256(work_a, p0); - - work_a = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = 
_mm256_adds_epi16(work_a, t80);
-  p1 = _mm256_loadu_si256((__m256i *)flat_op1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p1 = _mm256_and_si256(flat, p1);
-  p1 = _mm256_or_si256(work_a, p1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  p2 = _mm256_loadu_si256((__m256i *)flat_op2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p2 = _mm256_and_si256(flat, p2);
-  p2 = _mm256_or_si256(work_a, p2);
-
-  _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-  _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-#endif  // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 0a399edf2..83e0098ba 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -11,29 +11,23 @@
 #include <emmintrin.h>  // SSE2

-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom_ports/emmintrin_compat.h"
-#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"

-static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                               __m128i *pixel) {
-  __m128i clamped, mask;
+#include "aom_dsp/x86/lpf_common_sse2.h"

-  mask = _mm_cmpgt_epi16(*pixel, *max);
-  clamped = _mm_andnot_si128(mask, *pixel);
-  mask = _mm_and_si128(mask, *max);
-  clamped = _mm_or_si128(mask, clamped);
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+                                         __m128i *pixel) {
+  *pixel = _mm_min_epi16(*pixel, *max);
+  *pixel = _mm_max_epi16(*pixel, *min);
+}

-  mask = _mm_cmpgt_epi16(clamped, *min);
-  clamped = _mm_and_si128(mask, clamped);
-  mask = _mm_andnot_si128(mask, *min);
-  *pixel = _mm_or_si128(clamped, mask);
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
 }
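
The rewritten pixel_clamp above replaces the old four-step compare/mask/merge sequence with a single saturating min/max pair, and abs_diff16 builds |a - b| for unsigned 16-bit lanes from two saturating subtractions. A scalar sketch of the two idioms, one lane at a time (illustration only; the names below are ours, not part of the aom sources):

#include <stdint.h>

/* One lane of pixel_clamp: min then max, branchless in the SIMD form. */
static int16_t clamp16(int16_t v, int16_t lo, int16_t hi) {
  v = (v > hi) ? hi : v;  /* _mm_min_epi16(v, max) */
  v = (v < lo) ? lo : v;  /* _mm_max_epi16(v, min) */
  return v;
}

/* One lane of abs_diff16: (a -sat b) | (b -sat a). The subtraction that
   would go negative saturates to zero, so the OR keeps |a - b|. */
static uint16_t abs_diff16_lane(uint16_t a, uint16_t b) {
  uint16_t d0 = (a > b) ? (uint16_t)(a - b) : 0;
  uint16_t d1 = (b > a) ? (uint16_t)(b - a) : 0;
  return (uint16_t)(d0 | d1);
}
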
 static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
                              const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr) {
+                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   const int shift = bd - 8;
   const __m128i zero = _mm_setzero_si128();
@@ -45,6 +39,36 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
   x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   *thr = _mm_slli_epi16(x, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+    __m128i *t80_out) {
+  const int shift = bd - 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i x0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+  __m128i x1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *blt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *lt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *thr_out = _mm_slli_epi16(x0, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
 }
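
get_limit_dual packs the thresholds for two filtered edges into the low and high 64-bit halves of one register via _mm_unpacklo_epi64, promotes the 8-bit values to the coding bit depth with a left shift of (bd - 8), and emits t80 = 1 << (bd - 1), the mid-range pivot that replaces the old per-bit-depth constants (0x80, 0x200, 0x800 -- see the removed switches further down). A minimal scalar check of those constants (our sketch, not aom code):

#include <stdio.h>

int main(void) {
  const int bds[] = { 8, 10, 12 };
  const unsigned blimit8 = 0x10;  /* any 8-bit threshold lane */
  for (int i = 0; i < 3; ++i) {
    const int bd = bds[i];
    /* threshold promoted by (bd - 8); t80 is the signed-range pivot */
    printf("bd=%2d  blimit=0x%03x  t80=0x%03x\n", bd, blimit8 << (bd - 8),
           1u << (bd - 1));
  }
  return 0;  /* prints t80 = 0x080, 0x200, 0x800 */
}
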

 static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
@@ -55,115 +79,217 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
     q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
   }
 }
-// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
-                                   const __m128i *t, __m128i *hev) {
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *t);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  const __m128i zero = _mm_setzero_si128();
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
-                                      const __m128i *l, const __m128i *bl,
-                                      __m128i *mask) {
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+                                           const __m128i *l, const __m128i *bl,
+                                           __m128i *mask) {
+  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
   __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
   max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
   max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
   int i;
   for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
-                                          _mm_subs_epu16(p[i - 1], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
-                                          _mm_subs_epu16(q[i - 1], q[i])));
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
   }
   max = _mm_subs_epu16(max, *l);
   *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
 }
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
-                                      const __m128i *q, int bd, int start,
-                                      int end, __m128i *flat) {
-  __m128i max = _mm_setzero_si128();
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+                                                 __m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *abs_p1p0, __m128i *l,
+                                                 __m128i *bl, __m128i *t,
+                                                 __m128i *hev, __m128i *mask) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+  __m128i max, max01, h;
+
+  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
+
+  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+  // So taking maximums continues to work:
+  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+  // mask |= (abs(*p1 - *p0) > limit) * -1;
+  // mask |= (abs(*q1 - *q0) > limit) * -1;
+  h = _mm_subs_epu16(max01, *t);
+
+  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // replicate for the further "merged variables" usage
+  *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+  max = _mm_max_epi16(max, max01);
+  int i;
+  for (i = 2; i < x; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  max = _mm_subs_epu16(max, *l);
+  *mask = _mm_cmpeq_epi16(max, zero);  // ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+                                      int start, int end, __m128i *flat) {
+  int i;
+  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+                              abs_diff16(pq[start + 1], pq[0]));
+
+  for (i = start + 2; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  __m128i ft;
+  ft = _mm_subs_epu16(max, *th);
+
+  const __m128i zero = _mm_setzero_si128();
+  *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+                                           const __m128i *q, int start, int end,
+                                           __m128i *flat) {
   int i;
-  for (i = start; i < end; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
-                                          _mm_subs_epu16(p[0], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
-                                          _mm_subs_epu16(q[0], q[i])));
+  __m128i max =
+      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+  for (i = start + 1; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
   }
   __m128i ft;
-  if (bd == 8)
-    ft = _mm_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
+  ft = _mm_subs_epu16(max, *th);
   const __m128i zero = _mm_setzero_si128();
   *flat = _mm_cmpeq_epi16(ft, zero);
 }
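
highbd_hev_filter_mask_x_sse2 above is the "merged variables" form of the old hev/filter-mask pair: the p and q rows travel in the two 64-bit halves of each pq[i] register, one pass computes both masks, and _mm_unpacklo_epi64 replicates the result across halves. Per pixel column, the decision it vectorizes looks roughly like this (a scalar sketch under our own names, following the inline comments; not aom API):

#include <stdlib.h>
#include <stdint.h>

/* -1 (all ones) when the edge may be filtered, 0 otherwise. */
static int filter_mask_scalar(const uint16_t *p, const uint16_t *q, int count,
                              int limit, int blimit) {
  int ok = abs(p[0] - q[0]) * 2 + abs(p[1] - q[1]) / 2 <= blimit;
  for (int i = 1; i < count; ++i) {
    ok = ok && abs(p[i] - p[i - 1]) <= limit;
    ok = ok && abs(q[i] - q[i - 1]) <= limit;
  }
  return ok ? -1 : 0;
}

/* hev: "high edge variance" -- strong filter taps apply only where the
   immediate neighbourhood already jumps by more than thresh. */
static int hev_mask_scalar(const uint16_t *p, const uint16_t *q, int thresh) {
  const int m = abs(p[1] - p[0]) > thresh || abs(q[1] - q[0]) > thresh;
  return m ? -1 : 0;
}
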
-// Note:
-// Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+                                          __m128i *flat2, int bd) {
   // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal(&th, pq, 1, 4, flat);
+  flat_mask_internal(&th, pq, 4, 7, flat2);
 }
-// Note:
-// access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+                                               const __m128i *q, __m128i *flat,
+                                               __m128i *flat2, int bd) {
+  // check the distance 1,2,3 against 0
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
 }
-static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
-                                  const __m128i *th, int bd, __m128i *ps,
-                                  __m128i *qs) {
-  __m128i t80;
-  if (bd == 8)
-    t80 = _mm_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm_set1_epi16(0x800);
-
-  __m128i ps0 = _mm_subs_epi16(p[0], t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], t80);
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *hev, __m128i *mask,
+                                                 __m128i *qs1qs0,
+                                                 __m128i *ps1ps0, __m128i *t80,
+                                                 int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i pmax =
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+  __m128i ps1ps0_work, qs1qs0_work, work;
+  __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &work);
+  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  pixel_clamp(&pmin, &pmax, &filt);
+  filt = _mm_and_si128(filt, *mask);
+  filt = _mm_unpacklo_epi64(filt, filt);
+
+  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+  pixel_clamp(&pmin, &pmax, &filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
+  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filt, one);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(*hev, filt);
+
+  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+                                            __m128i *qs, const __m128i *mask,
+                                            const __m128i *th, int bd,
+                                            __m128i *t80) {
+  __m128i ps0 =
_mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); const __m128i one = _mm_set1_epi16(1); const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i zero = _mm_setzero_si128(); - const __m128i pmin = _mm_subs_epi16(zero, t80); + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); __m128i filter = _mm_subs_epi16(ps1, qs1); pixel_clamp(&pmin, &pmax, &filter); + // hev_filter __m128i hev; - highbd_hev_mask(p, q, th, &hev); + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + filter = _mm_and_si128(filter, hev); const __m128i x = _mm_subs_epi16(qs0, ps0); @@ -172,145 +298,332 @@ static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, filter = _mm_adds_epi16(filter, x); pixel_clamp(&pmin, &pmax, &filter); filter = _mm_and_si128(filter, *mask); - const __m128i t3 = _mm_set1_epi16(3); const __m128i t4 = _mm_set1_epi16(4); - __m128i filter1 = _mm_adds_epi16(filter, t4); __m128i filter2 = _mm_adds_epi16(filter, t3); pixel_clamp(&pmin, &pmax, &filter1); pixel_clamp(&pmin, &pmax, &filter2); filter1 = _mm_srai_epi16(filter1, 3); filter2 = _mm_srai_epi16(filter2, 3); - qs0 = _mm_subs_epi16(qs0, filter1); pixel_clamp(&pmin, &pmax, &qs0); ps0 = _mm_adds_epi16(ps0, filter2); pixel_clamp(&pmin, &pmax, &ps0); - - qs[0] = _mm_adds_epi16(qs0, t80); - ps[0] = _mm_adds_epi16(ps0, t80); - + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); filter = _mm_adds_epi16(filter1, one); filter = _mm_srai_epi16(filter, 1); filter = _mm_andnot_si128(hev, filter); - qs1 = _mm_subs_epi16(qs1, filter); pixel_clamp(&pmin, &pmax, &qs1); ps1 = _mm_adds_epi16(ps1, filter); pixel_clamp(&pmin, &pmax, &ps1); - - qs[1] = _mm_adds_epi16(qs1, t80); - ps[1] = _mm_adds_epi16(ps1, t80); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); } -typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - -static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd, - PixelOutput pixel_output) { +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; __m128i blimit, limit, thresh; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; - __m128i p[8], q[8]; - load_highbd_pixel(s, 8, pitch, p, q); + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); - __m128i mask; - highbd_filter_mask(p, q, &limit, &blimit, &mask); + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); __m128i flat, flat2; - const __m128i one = _mm_set1_epi16(1); - highbd_flat_mask4(&one, p, q, &flat, bd); - highbd_flat_mask5(&one, p, q, &flat2, bd); + 
highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); flat = _mm_and_si128(flat, mask); flat2 = _mm_and_si128(flat2, flat); - __m128i ps[2], qs[2]; - highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[7], flat2_q[7]; + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; { + __m128i work0; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); + flat2_q[0] = + _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + __m128i sum_p6, sum_p3; + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p, p[5]); + sum_p = _mm_sub_epi16(sum_p, q[5]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); + flat2_p[1] = _mm_add_epi16(sum_p, work0); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); - __m128i sum_p = - _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); - __m128i sum_q = - _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = 
_mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } + + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // highbd_filter16 + for (i = 5; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + // get values for when (flat2 && flat && mask) + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } +} + +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); + } + + highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); + // flat and wide flat calculations + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); - __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - - flat2_p[0] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); - flat2_q[0] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat2_p[0] = _mm_srli_epi16( + 
_mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); - - __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); - __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - - sum_q = _mm_sub_epi16(sum_p, p[6]); - sum_p = _mm_sub_epi16(sum_p, q[6]); - flat2_p[1] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); - flat2_q[1] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); - + sum_q = _mm_sub_epi16(sum_p, p[5]); + sum_p = _mm_sub_epi16(sum_p, q[5]); + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p[7]); - sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p3 = _mm_add_epi16(sum_p3, p[3]); sum_q3 = _mm_add_epi16(sum_q3, q[3]); - - sum_p = _mm_sub_epi16(sum_p, q[5]); - sum_q = _mm_sub_epi16(sum_q, p[5]); - flat2_p[2] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); - flat2_q[2] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); - + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - - int i; - for (i = 3; i < 7; ++i) { - sum_p7 = _mm_add_epi16(sum_p7, p[7]); - sum_q7 = _mm_add_epi16(sum_q7, q[7]); - sum_p = _mm_sub_epi16(sum_p, q[7 - i]); - sum_q = _mm_sub_epi16(sum_q, p[7 - i]); - flat2_p[i] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); - flat2_q[i] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); - } + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = 
_mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); } - // highbd_filter8 p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) @@ -320,7 +633,6 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, q[2] = _mm_andnot_si128(flat, q[2]); flat_q[2] = _mm_and_si128(flat, flat_q[2]); q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; for (i = 1; i >= 0; i--) { ps[i] = _mm_andnot_si128(flat, ps[i]); @@ -330,675 +642,979 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, flat_q[i] = _mm_and_si128(flat, flat_q[i]); q[i] = _mm_or_si128(qs[i], flat_q[i]); } - // highbd_filter16 - - if (pixel_output == FOUR_PIXELS) { - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); - _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); - } - } else { // EIGHT_PIXELS - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); - } + for (i = 5; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); } } -// Note: -// highbd_lpf_horz_edge_8_8p() output 8 pixels per register -// highbd_lpf_horz_edge_8_4p() output 4 pixels per register -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 -static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); -} -#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - -static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - 
highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); -} - -void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); -#endif -} - -void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); - highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); -#endif -} - -static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, - const __m128i *p0, const __m128i *q0, - const __m128i *q1, const __m128i *q2, - int p, uint16_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); -#else - _mm_store_si128((__m128i *)(s - 3 * p), *p2); - _mm_store_si128((__m128i *)(s - 2 * p), *p1); - _mm_store_si128((__m128i *)(s - 1 * p), *p0); - _mm_store_si128((__m128i *)(s + 0 * p), *q0); - _mm_store_si128((__m128i *)(s + 1 * p), *q1); - _mm_store_si128((__m128i *)(s + 2 * p), *q2); -#endif +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } } -void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); - const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_cmpeq_epi16(one, 
one);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i workp_a, workp_b, workp_shft;
+  __m128i pq[3];
+  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+  __m128i flat_p1p0, flat_q0q1;
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
   __m128i t80;
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_set1_epi16(0x200);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_set1_epi16(0x800);
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
   }
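
In the block above, the four added up front is the rounding term of ROUND_POWER_OF_TWO applied before the final >> 3, and the workp_a/workp_b running sums are updated incrementally instead of being rebuilt per output tap. Written out per pixel column, the op1/op0/oq0/oq1 outputs are (a sketch that follows the inline comments; the macro is redefined locally so the snippet is self-contained):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Filter-6 flat path, one pixel column (mirrors the comments above). */
static void filter6_scalar(const uint16_t *p, const uint16_t *q,
                           uint16_t *op1, uint16_t *op0, uint16_t *oq0,
                           uint16_t *oq1) {
  *op1 = ROUND_POWER_OF_TWO(p[2] * 3 + p[1] * 2 + p[0] * 2 + q[0], 3);
  *op0 = ROUND_POWER_OF_TWO(p[2] + p[1] * 2 + p[0] * 2 + q[0] * 2 + q[1], 3);
  *oq0 = ROUND_POWER_OF_TWO(p[1] + p[0] * 2 + q[0] * 2 + q[1] * 2 + q[2], 3);
  *oq1 = ROUND_POWER_OF_TWO(p[0] + q[0] * 2 + q[1] * 2 + q[2] * 3, 3);
}
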
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
-  ps1 = _mm_subs_epi16(p1, t80);
-  ps0 = _mm_subs_epi16(p0, t80);
-  qs0 = _mm_subs_epi16(q0, t80);
-  qs1 = _mm_subs_epi16(q1, t80);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-  // filter_mask and hev_mask
-  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+}
-  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+    const unsigned char *_thresh0, const unsigned char *_blimit1,
+    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat, work;
+  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+  __m128i op1, op0, oq0, oq1;
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p2p1 = abs_diff16(*p2, *p1);
+  abs_p1p0 = abs_diff16(*p1, *p0);
+  abs_q1q0 = abs_diff16(*q1, *q0);
+  abs_q2q1 = abs_diff16(*q2, *q1);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
-  mask = _mm_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  mask = _mm_max_epi16(abs_q2q1, mask);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
   mask = _mm_max_epi16(work, mask);
+  mask = _mm_max_epi16(mask, abs_p2p1);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+  flat = _mm_max_epi16(flat, work);
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
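
The andnot/and/or triples that produce *q1q0_out and *p1p0_out above are SSE2's branchless select: given a lane mask of 0x0000/0xFFFF, (a & m) | (b & ~m) keeps a where the mask is set and b elsewhere, so flat pixels take the flat-filter output and the rest take the 4-tap output. The idiom in isolation (hypothetical helper name, not an aom function):

#include <emmintrin.h>

/* dst = mask ? a : b, lane-wise; mask lanes must be 0x0000 or 0xFFFF. */
__m128i select_epi16(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}
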
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+    op1 = _mm_srli_epi16(workp_shft0, 3);
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+    op0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+    oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+    oq1 = _mm_srli_epi16(workp_shft1, 3);
+  }
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
+
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
+
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
+
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+                             _blimit, _limit, _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
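
The non-dual wrapper above works on 4-pixel rows: each row is fetched with _mm_loadl_epi64, two rows share one register via _mm_unpacklo_epi64, and the high row is recovered for the store with _mm_srli_si128. A self-contained round trip of that pack/store pattern (illustration only, under our own variable names):

#include <emmintrin.h>
#include <stdint.h>

int main(void) {
  /* Two 4x16-bit rows packed into one register, then recovered,
     mirroring how p1p0_out is stored above. */
  uint16_t r0[4] = { 1, 2, 3, 4 }, r1[4] = { 5, 6, 7, 8 }, out[8];
  __m128i a = _mm_loadl_epi64((const __m128i *)r0);  /* low 64 bits */
  __m128i b = _mm_loadl_epi64((const __m128i *)r1);
  __m128i p1p0 = _mm_unpacklo_epi64(a, b);           /* r0 | r1 */
  _mm_storel_epi64((__m128i *)out, p1p0);            /* row r0 back */
  _mm_storel_epi64((__m128i *)(out + 4), _mm_srli_si128(p1p0, 8)); /* row r1 */
  return 0;
}
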
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    const unsigned char *_blimit, const unsigned char *_limit,
+    const unsigned char *_thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i pq[4];
+  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+  __m128i abs_p1p0;
+
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i t80;
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
   // flat_mask4
-  flat = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
-  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
   flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-  if (bd == 8)
-    flat = _mm_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
   flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
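
The flat test above compares each sample at distance 1..3 from the edge against the boundary sample, using a threshold scaled to the bit depth (1 << (bd - 8)); the result is ANDed with the filter mask and replicated into both register halves for the merged-variable code that follows. Roughly, per pixel column (our scalar sketch, not aom code):

#include <stdlib.h>
#include <stdint.h>

/* -1 (all ones) when the 8-tap edge is "flat" enough for the wide filter. */
static int flat_mask_scalar(const uint16_t *p, const uint16_t *q, int bd) {
  const int th = 1 << (bd - 8);
  int flat = 1;
  for (int i = 1; i < 4; ++i) {
    flat = flat && abs(p[i] - p[0]) <= th && abs(q[i] - q[0]) <= th;
  }
  return flat ? -1 : 0;
}
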
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // o*p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // o*p0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
+
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
-  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+    const unsigned char *_limit0, const unsigned char *_thresh0,
+    const unsigned char *_blimit1, const unsigned char *_limit1,
+    const unsigned char *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat;
+  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
+
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+  work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
+  work0 = _mm_max_epi16(work0, work1);
+  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+  work2 = _mm_max_epi16(work2, work0);
+  mask = _mm_max_epi16(work2, mask);
+
+  mask = _mm_subs_epu16(mask, limit0);
+  mask = _mm_cmpeq_epi16(mask, zero);
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a,
workp_b), 3); - _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + } // lp filter - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i pmin = _mm_subs_epi16(zero, t80); + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } - filt = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); - filt = _mm_and_si128(filt, hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm_and_si128(filt, mask); + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); - filter1 = _mm_adds_epi16(filt, t4); - filter2 = _mm_adds_epi16(filt, t3); + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); - // Filter1 >> 3 - pixel_clamp(&pmin, &pmax, &filter1); - filter1 = _mm_srai_epi16(filter1, 3); + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); - // Filter2 >> 3 - pixel_clamp(&pmin, &pmax, &filter2); - filter2 = _mm_srai_epi16(filter2, 3); + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = 
_mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_highbd_lpf_horizontal_8_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = 
_mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } -void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; - __m128i mask, hev, flat; -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); const __m128i one = _mm_set1_epi16(1); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - const __m128i t4 = _mm_set1_epi16(4); - const __m128i t3 = _mm_set1_epi16(3); __m128i t80; - __m128i tff80; - __m128i tffe0; - __m128i t1f; - // equivalent to shifting 0x1f left by bitdepth - 8 - // and setting new bits to 1 - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i t7f; - // equivalent to shifting 0x7f left by bitdepth - 8 - // and setting new bits to 1 - __m128i ps1, ps0, qs0, qs1; - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - if (bd == 8) { - blimit = 
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - t80 = _mm_set1_epi16(0x80); - tff80 = _mm_set1_epi16(0xff80); - tffe0 = _mm_set1_epi16(0xffe0); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); - t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); - } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); mask = _mm_max_epi16(flat, mask); -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); -#endif - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); - // filter4 - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i pmin = _mm_subs_epi16(zero, t80); - - filt = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = 
_mm_and_si128(filt, hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); - - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi16(filt, t4); - pixel_clamp(&pmin, &pmax, &filter1); - - filter2 = _mm_adds_epi16(filt, t3); - pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm_and_si128(filter1, t1f); // clamp the range - filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; - // Filter2 >> 3 - work_a = _mm_cmpgt_epi16(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, tffe0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - work_a = _mm_cmpgt_epi16(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, tff80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &q0); - q0 = _mm_adds_epi16(q0, t80); - - q1 = _mm_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &q1); - q1 = _mm_adds_epi16(q1, t80); - - p0 = _mm_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &p0); - p0 = _mm_adds_epi16(p0, t80); - - p1 = _mm_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &p1); - p1 = _mm_adds_epi16(p1, t80); -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); -#else - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); -#endif +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); } void aom_highbd_lpf_horizontal_4_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 
* p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); } void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); - uint16_t *src[1]; - uint16_t *dst[1]; + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); - // Loop filtering - aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); - src[0] = t_dst; - dst[0] = s - 4; + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); - // Transpose back - highbd_transpose(src, 8, dst, p, 1); + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); } void aom_highbd_lpf_vertical_4_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} - // Loop filtering - aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, 
limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); } void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); - uint16_t *src[1]; - uint16_t *dst[1]; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); // Loop filtering - aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + 
highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); - src[0] = t_dst; - dst[0] = s - 4; + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); } void aom_highbd_lpf_vertical_8_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; - // Loop filtering - aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); -void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, 
uint16_t, t_dst[8 * 16]); - uint16_t *src[2]; - uint16_t *dst[2]; + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); - // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, &d1, &d2, &d3); - // Loop filtering - aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, - bd); - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; - - // Transpose back - highbd_transpose(src, 8, dst, p, 2); -} - -void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[256]); - - // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); -#else - aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); -#endif - // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * 
pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); } diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c index 2bbf15ef2..dea113a29 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 855bc6558..e0d22522d 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -288,11 +288,9 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 HIGH_SADNXN4D 16, 64 HIGH_SADNXN4D 64, 16 -%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 760e68aab..3398d8a2a 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -158,10 +158,8 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 -%endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -230,10 +228,8 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; 
highbd_sad32x16_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 -%endif ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -302,12 +298,10 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 -%endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -376,7 +370,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 -%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index ee19796e3..61f5b8e86 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -94,7 +94,7 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ @@ -102,19 +102,20 @@ SECTION .text sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -133,8 +134,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height heightd ; Store bilin_filter and pw_8 location in stack @@ -153,22 +155,16 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + dst, 
dst_stride, height, sse %define block_height heightd %endif @@ -287,14 +283,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -311,7 +307,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -514,14 +510,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -538,7 +534,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -636,14 +632,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -660,7 +656,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -735,14 +731,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -759,7 +755,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -862,8 +858,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -872,7 +868,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -900,7 +896,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; 
end of load filter diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c index befd81269..18eb03d12 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -13,8 +13,8 @@ #include <emmintrin.h> #include <stddef.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, @@ -204,21 +204,15 @@ SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } -#if CONFIG_EXT_PARTITION SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } -#endif SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } -#if CONFIG_EXT_PARTITION -SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); } -SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); } -#endif static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { if (rows == 4) { @@ -244,25 +238,17 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { if (cols == 16) return subtract_16x32; if (cols == 32) return subtract_32x32; if (cols == 64) return subtract_64x32; -#if CONFIG_EXT_PARTITION - if (cols == 128) return subtract_128x32; -#endif // CONFIG_EXT_PARTITION } if (rows == 64) { if (cols == 16) return subtract_16x64; if (cols == 32) return subtract_32x64; if (cols == 64) return subtract_64x64; -#if CONFIG_EXT_PARTITION if (cols == 128) return subtract_128x64; -#endif // CONFIG_EXT_PARTITION } -#if CONFIG_EXT_PARTITION if (rows == 128) { - if (cols == 32) return subtract_32x128; if (cols == 64) return subtract_64x128; if (cols == 128) return subtract_128x128; } -#endif // CONFIG_EXT_PARTITION assert(0); return NULL; } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm index cf8ea498c..0d954e178 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -14,6 +14,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int aom_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 62acf3ed3..fdfadc886 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -12,13 +12,17 @@ #include <assert.h> #include <emmintrin.h> // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" -#include "./av1_rtcd.h" #include "av1/common/filter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, 
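For reference, each VAR_FN(w, h, ...) wrapper adjusted in the next hunk reduces to the same scalar identity; a minimal sketch of it, assuming shift equals log2(w * h) and using an illustrative name rather than aom's:

#include <stdint.h>

/* Scalar form of the block variance the VAR_FN(w, h, ...) wrappers
 * return: sse and sum are the per-block accumulations produced by the
 * SSE2 kernels and shift is log2(w * h); the clamp mirrors the
 * (var >= 0) check visible in the FN macro below. */
static uint32_t highbd_block_variance_ref(int64_t sse, int64_t sum,
                                          int shift) {
  const int64_t var = sse - ((sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}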
@@ -185,13 +189,11 @@ VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); VAR_FN(8, 16, 8, 7); VAR_FN(8, 8, 8, 6); -#if CONFIG_EXT_PARTITION_TYPES VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); -VAR_FN(32, 8, 16, 8); +VAR_FN(32, 8, 8, 8); VAR_FN(16, 64, 16, 10); VAR_FN(64, 16, 16, 10); -#endif #undef VAR_FN @@ -398,7 +400,6 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION_TYPES #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ FN(64, 32, 16, 6, 5, opt, (int64_t)); \ @@ -416,20 +417,6 @@ DECLS(sse2); FN(32, 8, 16, 5, 3, opt, (int64_t)); \ FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)) -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)) -#endif FNS(sse2); @@ -577,7 +564,6 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION_TYPES #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ FN(64, 32, 16, 6, 5, opt, (int64_t)); \ @@ -595,30 +581,104 @@ DECLS(sse2); FN(32, 8, 16, 5, 3, opt, (int64_t)); \ FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)); -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); -#endif FNS(sse2); #undef FNS #undef FN -void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint16_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. 
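+ // The fixed-point arithmetic below converts the block origin to
+ // subpel units, adds the motion vector (scaled from 1/8-pel to
+ // 1/16-pel since ss_x/ss_y are 0 here), applies the reference scale
+ // factors, and clamps the result to the padded reference area before
+ // splitting it into an integer pixel offset (pos >> SCALE_SUBPEL_BITS)
+ // and a subpel phase (pos & SCALE_SUBPEL_MASK).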
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + return; + } + } + + const InterpFilterParams filter = + av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); if (width >= 8) { @@ -648,54 +708,48 @@ void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, ref += 2 * ref_stride; } } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, kernel, 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, NULL, -1, kernel, 16, width, height, bd); } else { - InterpFilterParams filter; - filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); - if (!subpel_y_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, - CONVERT_TO_BYTEPTR(comp_pred), width, kernel, - 16, NULL, -1, width, height, bd); - } else if (!subpel_x_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); - } else { - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); - const int16_t *kernel_x; - const int16_t *kernel_y; - int intermediate_height; - kernel_x = 
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); - } + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, + 16, width, height, bd); } } -void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, - const uint8_t *pred8, int width, - int height, int subpel_x_q3, - int subpel_y_q3, - const uint8_t *ref8, - int ref_stride, int bd) { +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; int i; - aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, - ref8, ref_stride, bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); n = width * height >> 3; @@ -707,3 +761,102 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, pred += 8; } } + +static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { + int i; + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + 
_mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); + assert(!(width * height & 7)); + n = width * height >> 3; + + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c index cc7f52811..6c247a91b 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -11,8 +11,8 @@ #include <smmintrin.h> /* SSE4.1 */ -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c index 6b8922b8c..1e67d392e 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -11,7 +11,20 @@ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return 
_mm256_add_epi16(y0, u0); +} static INLINE __m256i dc_sum_32(const uint8_t *ref) { const __m256i x = _mm256_loadu_si256((const __m256i *)ref); @@ -25,13 +38,31 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) { static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { - int i; - for (i = 0; i < height; ++i) { + for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); dst += stride; } } +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); @@ -168,11 +199,58 @@ void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(left_sum); sum += 24; sum /= 48; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); row_store_32xh(&row, 16, dst, stride); } +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -187,6 +265,62 @@ void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const 
uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -202,6 +336,63 @@ void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = 
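/* Broadcast trick used throughout: shuffling with an all-zero selector
   (pshufb) replicates byte 0 -- here the rounded average (sum + 8) >> 4
   of the 16 left pixels -- into every byte of the register; inserti128
   then doubles the 128-bit result up to the 256-bit row width. */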
_mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -211,6 +402,42 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row = _mm256_loadu_si256((const __m256i *)above); @@ -218,8 +445,39 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + // ----------------------------------------------------------------------------- -// TM_PRED +// PAETH_PRED // Return 16 16-bit pixels in one row (__m256i) static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, @@ -336,6 +594,26 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = 
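/* PAETH recap: for each pixel, base = left + top - topleft, and the
   predictor picks whichever of left/top/topleft is closest to base
   (ties resolved in that order). paeth_pred()/paeth_16x1_pred() above
   evaluate this on 16-bit lanes; heights over 16 are handled 16 rows
   at a time, reloading 16 left pixels per block as here. */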
get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + // Return 32 8-bit pixels in one row (__m256i) static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, const __m256i *top1, @@ -411,3 +689,123 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, rep = _mm256_add_epi16(rep, one); } } + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = 
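/* The 'rep' selector drives a per-row broadcast: each 16-bit lane
   starts at 0x8000, i.e. bytes {0x00, 0x80}. pshufb maps selector
   0x00 to byte 0 of l and any selector with the MSB set to zero, so
   l16 holds left[i] zero-extended to 16 bits in every lane; adding 1
   per row advances the selector to left[i + 1]. */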
_mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c index 2a83b9001..5b2452c8e 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -11,11 +11,11 @@ #include <emmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" -static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { - int i; - for (i = 0; i < 4; ++i) { +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { *(uint32_t *)dst = dc; dst += stride; *(uint32_t *)dst = dc; @@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, } } +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + static INLINE __m128i dc_sum_4(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); @@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) { return _mm_add_epi16(x0, high); } +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + // ----------------------------------------------------------------------------- // DC_PRED @@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = 
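/* divide_using_multiply_shift() replaces the old integer division for
   rectangular blocks: num / 12 becomes (num >> 2) * 0x5556 >> 16,
   since 0x5556 = 21846 ~= 2^16 / 3 (3 * 21846 = 65538). Worked
   example: sum = 1530 (12 pixels of 127 plus the bias 6) ->
   1530 >> 2 = 382, then 382 * 21846 >> 16 = 127. DC_MULTIPLIER_1X4 =
   0x3334 ~= 2^16 / 5 plays the same role for 1:4 aspect ratios. */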
_mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); const uint32_t pred = _mm_cvtsi128_si32(row); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 4, dst, stride); @@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); @@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 8, dst, stride); } @@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const 
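/* Shift/multiplier pairing for each border total W + H:
   12 -> (>>2, 1X2), 24 -> (>>3, 1X2), 48 -> (>>4, 1X2),
   96 -> (>>5, 1X2) for 1:2 blocks (divide by 3 after the shift);
   20 -> (>>2, 1X4), 40 -> (>>3, 1X4), 80 -> (>>4, 1X4) for 1:4
   blocks (divide by 5). Power-of-two totals such as 64x64's 128 keep
   the plain division, which compilers already lower to a shift. */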
__m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32(above); @@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_TOP @@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_above = _mm_packus_epi16(sum_above, sum_above); const uint32_t pred = _mm_cvtsi128_si32(sum_above); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); 
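/* 4-wide broadcast: after (sum + 2) >> 2 the average sits in lane 0;
   shufflelo replicates it across the four low 16-bit lanes, packus
   saturates to bytes, and cvtsi128_si32 extracts the 4 identical
   bytes that dc_store_4xh() then writes as one 32-bit store per row. */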
+ sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; @@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + 
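/* All DC_TOP/DC_LEFT variants reduce one border of n pixels and round
   with (sum + n/2) >> log2(n): n = 64 here, hence the +32 and the
   arithmetic shift by 6 below. */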
(void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_LEFT @@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_left = _mm_packus_epi16(sum_left, sum_left); const uint32_t pred = _mm_cvtsi128_si32(sum_left); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + void 
aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -333,6 +683,62 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = 
_mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_128 @@ -341,7 +747,15 @@ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, (void)above; (void)left; const uint32_t pred = 0x80808080; - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // V_PRED @@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint32_t pred = *(uint32_t *)above; (void)left; - dc_store_4x8(pred, 
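/* DC_128 above needs no border at all: it fills with the mid-gray
   1 << (8 - 1) = 0x80, replicated as the 32-bit pattern 0x80808080
   for the 4-wide stores. V_PRED, starting here, simply repeats the
   `above` row down the block. */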
dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); } void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); @@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } -void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { (void)left; - int i; - for (i = 0; i < 16; ++i) { + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, row0); _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); dst += stride; } } +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + 
v_predictor_64xh(dst, stride, above, 16); +} + // ----------------------------------------------------------------------------- // H_PRED @@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, *(uint32_t *)dst = _mm_cvtsi128_si32(row3); } -void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i left_col = _mm_loadl_epi64((__m128i const *)left); - left_col = _mm_unpacklo_epi8(left_col, left_col); - __m128i row0 = _mm_shufflelo_epi16(left_col, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_load_si128((__m128i const *)left); @@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); @@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_low, 0x55); row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); @@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = 
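/* H_PRED builds each row by broadcasting one left pixel:
   unpacklo_epi8(l, l) widens bytes to 16-bit pairs, then
   shufflelo_epi16 with selectors 0x00/0x55/0xaa/0xff picks lanes
   0..3, yielding four rows per shuffle group; the 4-wide variant
   stores only the low 32 bits of each row. */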
_mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); @@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, _mm_storel_epi64((__m128i *)dst, row3); } +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, 
ptrdiff_t stride) { int i; @@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, h_pred_store_16xh(row, 4, dst, stride); } +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, h_prediction_16x8_2(&left_col_8p, dst, stride); } -void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i left_col, left_col_8p; - (void)above; +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { int i = 0; - do { - left_col = _mm_load_si128((const __m128i *)left); - left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); dst += stride << 2; - left_col_8p = _mm_unpackhi_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); dst += stride << 2; left += 16; i++; - } while (i < 2); + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); } static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, @@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, h_pred_store_32xh(row, 4, dst, stride); } +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i left_col, left_col_8p; @@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dst += stride << 2; h_prediction_32x8_2(&left_col_8p, dst, stride); } + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const 
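/* Wider H_PRED rows: four left pixels are loaded as one 32-bit word,
   the two unpacklo_epi8 steps expand each byte b to bbbb, and
   _mm_shuffle_epi32 with 0x00/0x55/0xaa/0xff broadcasts one such
   group across the register, giving 16 copies of left[r] per store. */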
__m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c index 85b82744e..807ed1770 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -11,11 +11,12 @@ #include <tmmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- -// TM_PRED +// PAETH_PRED // Return 8 16-bit pixels in one row static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, @@ -82,6 +83,26 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i 
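/* paeth_8x1_pred()/paeth_16x1_pred() work on 16-bit lanes because the
   intermediate base = L + T - TL spans roughly -255..510, which does
   not fit in a byte; results are packed back to 8 bits with packus
   only at store time. */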
l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); @@ -145,6 +166,28 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + // Return 16 8-bit pixels in one row static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, const __m128i *top1, @@ -154,6 +197,27 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, return _mm_packus_epi16(p0, p1); } +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); @@ -234,6 +298,57 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 
16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i < 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -307,6 +422,162 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) 
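/* 64-wide PAETH: the top row is kept as four 16-byte vectors (a..d,
   each split into low/high 16-bit halves above) and the left column
   is consumed 16 rows at a time, so a 64xH block costs four
   paeth_16x1_pred() calls per row. */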
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i;
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  for (i = 0; i < 16; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
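For reference, the per-pixel selection that paeth_16x1_pred() applies to 16 pixels at a time (in 16-bit lanes, so left + top - top_left cannot overflow) is the classic Paeth filter. A scalar sketch, not part of the diff:

#include <stdint.h>
#include <stdlib.h> /* abs */

static uint8_t paeth_px(uint8_t left, uint8_t top, uint8_t top_left) {
  const int base = left + top - top_left;
  const int p_left = abs(base - left);         /* == abs(top - top_left)  */
  const int p_top = abs(base - top);           /* == abs(left - top_left) */
  const int p_top_left = abs(base - top_left);
  /* Ties resolve to left, then top, then top-left. */
  if (p_left <= p_top && p_left <= p_top_left) return left;
  return (p_top <= p_top_left) ? top : top_left;
}
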
 // -----------------------------------------------------------------------------
 // SMOOTH_PRED
 
@@ -315,9 +586,15 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
 // pixels[2]: right_pred vector
 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-  pixels[1] = _mm_loadl_epi64((const __m128i *)left);
 
   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
@@ -325,45 +602,52 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
   pixels[0] = _mm_unpacklo_epi16(d, bp);
 }
 
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vecotr
-// weights[2]: weights_w and scale - weights_w interleave vector
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weights) {
-  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+                                  __m128i *weight_h, __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-
-  weights[0] = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weights[1] = _mm_sub_epi16(d, weights[0]);
-  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+  weight_h[0] = _mm_unpacklo_epi8(t, zero);
+  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
 
   if (height == 8) {
-    t = _mm_srli_si128(t, 4);
-    weights[0] = _mm_unpacklo_epi8(t, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   }
 }
 
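The (weight, scale - weight) interleaving that load_weight_w4() builds is what lets a single _mm_madd_epi16 evaluate a two-tap blend in each 32-bit lane. Assuming sm_weight_log2_scale is 8 (its value in this version of aom, so scale = 256), the scalar arithmetic of one madd lane is simply:

/* One madd lane: (a, b) . (w, 256 - w) = w * a + (256 - w) * b. */
static int blend_256(int a, int b, int w) { return w * a + (256 - w) * b; }

The caller then adds the rounding constant and shifts, as the kernels below do.
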
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
-                                   int h, uint8_t *dst, ptrdiff_t stride) {
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+                                   const __m128i *ww, int h, uint8_t *dst,
+                                   ptrdiff_t stride, int second_half) {
   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
 
     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
     b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, weight[2]);
+    __m128i sum = _mm_madd_epi16(b, ww[0]);
 
     sum = _mm_add_epi32(s, sum);
     sum = _mm_add_epi32(sum, round);
@@ -383,10 +667,10 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 4, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 4, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 4, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
 }
 
 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
@@ -394,33 +678,68 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 8, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 8, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[3];
+  load_pixel_w4(above, left, 16, pixels);
+
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 8, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
+
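Putting the two madds of smooth_pred_4xh() together: each output pixel is a vertical blend toward the bottom-left sample plus a horizontal blend toward the top-right sample, halved with rounding (`round` in the code is scale, i.e. half of the 2 * scale divisor). A scalar sketch of the SMOOTH formula these kernels compute, not part of the diff:

#include <stdint.h>

static uint8_t smooth_px(int top, int left_px, int bottom_left, int top_right,
                         int w_h, int w_w, int log2_scale) {
  const int scale = 1 << log2_scale;
  const int sum = w_h * top + (scale - w_h) * bottom_left +   /* vertical   */
                  w_w * left_px + (scale - w_w) * top_right;  /* horizontal */
  /* Two blends are summed, so divide by 2 * scale, rounding to nearest. */
  return (uint8_t)((sum + scale) >> (log2_scale + 1));
}
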
 // pixels[0]: above and below_pred interleave vector, first half
 // pixels[1]: above and below_pred interleave vector, second half
 // pixels[2]: left vector
 // pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-  pixels[2] = _mm_load_si128((const __m128i *)left);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
-
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[7] = pixels[3];
+  }
+}
+
 // weight_h[0]: weight_h vector
 // weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
@@ -429,7 +748,6 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
   const int we_offset = height < 8 ? 4 : 8;
   __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
@@ -450,6 +768,19 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(we, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
@@ -531,355 +862,831 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab = _mm_load_si128((const __m128i *)above);
-  pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
-  pixels[4] = _mm_load_si128((const __m128i *)left);
-  pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[8];
+  load_pixel_w8(above, left, 32, pixels);
+
+  __m128i wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i top_right =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
+    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
+      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
+
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      const __m128i scale_m_weights_x =
+          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
+      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
+      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
+      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
+      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
+
+      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
+      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
+
+      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
+      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
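A note on the `gat` masks used throughout these kernels: after _mm_packus_epi16 merges two vectors of 32-bit results (each already in 0..255), only every other byte is meaningful, and the gather mask 0x6040200/0xe0c0a08 (or _mm_set1_epi32(0xc080400) in the 4-wide kernels) collects those bytes into the low lanes so one movq/movd can store the row. A sketch under the same in-range assumption; the helper name is hypothetical:

#include <tmmintrin.h> /* SSSE3 */

/* Pack eight 32-bit values (each already in 0..255) into the low 8 bytes. */
static __m128i pack_dwords_to_bytes(__m128i lo, __m128i hi) {
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  /* packus yields v0,0,v1,0,...; the shuffle keeps bytes 0,2,4,... */
  return _mm_shuffle_epi8(_mm_packus_epi16(lo, hi), gat);
}
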
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
 }
 
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
 
-  if (height == 8) {
-    weight_h[0] = _mm_unpacklo_epi8(w8, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight_h
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
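SMOOTH_V keeps only the vertical pair of taps: each pixel is a blend of the sample above it and the bottom-left sample, rounded with half of scale and shifted by sm_weight_log2_scale (not the 1 + ... shift of the full SMOOTH kernels). A scalar sketch, not part of the diff:

#include <stdint.h>

static uint8_t smooth_v_px(int top, int bottom_left, int w_h, int log2_scale) {
  const int scale = 1 << log2_scale;
  const int sum = w_h * top + (scale - w_h) * bottom_left;
  return (uint8_t)((sum + (scale >> 1)) >> log2_scale); /* round to nearest */
}
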
+// pixels[0]: above and below_pred interleave vector
+static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weights[0]: weights_h vector
+// weights[1]: scale - weights_h vector
+static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height == 4) {
+    const __m128i weight =
+        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(d, weights[2]);
   }
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i inc = _mm_set1_epi16(0x202);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i d = _mm_set1_epi16(0x100);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
+    d = _mm_add_epi16(d, inc);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 4, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 8, &pixels);
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 8, weights);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  load_weight_v_w4(sm_weight_arrays, 16, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
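The d/inc pair in smooth_v_pred_4xh() is the 16-bit cousin of the `rep` idiom: d starts at 0x100, so each lane's index bytes {0x00, 0x01} gather the first word of the weight vector into every lane, and adding 0x202 per row advances both indices to the next word. A sketch; `broadcast_word` is a hypothetical helper:

#include <tmmintrin.h> /* SSSE3 */

/* Return the i-th 16-bit element of v in every lane (i < 8). */
static __m128i broadcast_word(__m128i v, int i) {
  const __m128i d = _mm_set1_epi16((short)(0x100 + 0x202 * i));
  return _mm_shuffle_epi8(v, d);
}
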
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+  pixels[1] = _mm_unpackhi_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_h) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height < 16) {
+    const int offset = height < 8 ? 4 : 8;
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
-  const __m128i one = _mm_set1_epi16(1);
+static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
+                                     int h, uint8_t *dst, ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
+  __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
+  for (int i = 0; i < h; ++i) {
     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-    __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
-    __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
 
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[6]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-    __m128i sum2 = _mm_madd_epi16(b, ww[2]);
-    __m128i sum3 = _mm_madd_epi16(b, ww[3]);
+    s0 = _mm_add_epi32(s0, pred_round);
+    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
 
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    s2 = _mm_add_epi32(s2, sum2);
-    s2 = _mm_add_epi32(s2, round);
-    s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
-    s3 = _mm_add_epi32(s3, sum3);
-    s3 = _mm_add_epi32(s3, round);
-    s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    sum1 = _mm_packus_epi16(s2, s3);
-    sum1 = _mm_shuffle_epi8(sum1, gat);
-
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    _mm_storel_epi64((__m128i *)(dst + 8), sum1);
+    s1 = _mm_add_epi32(s1, pred_round);
+    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
 
+    __m128i sum01 = _mm_packus_epi16(s0, s1);
+    sum01 = _mm_shuffle_epi8(sum01, gat);
+    _mm_storel_epi64((__m128i *)dst, sum01);
     dst += stride;
-    rep = _mm_add_epi16(rep, one);
+    d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 8, pixels);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 4, pixels);
 
-  __m128i wh[2], ww[4];
-  load_weight_w16(sm_weight_arrays, 8, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 4, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
 }
 
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 8, pixels);
 
-  __m128i wh[4], ww[4];
-  load_weight_w16(sm_weight_arrays, 16, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 8, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 16, pixels);
+
+  __m128i wh[4];
+  load_weight_v_w8(sm_weight_arrays, 16, wh);
+
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
 }
 
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 32, pixels);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 32, pixels);
 
-  __m128i wh[8], ww[4];
-  load_weight_w16(sm_weight_arrays, 32, wh, ww);
+  __m128i wh[8];
+  load_weight_v_w8(sm_weight_arrays, 32, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
 }
 
-static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab0 = _mm_load_si128((const __m128i *)above);
-  __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
+static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i bottom_left =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round =
+      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i scale_m_weights_y =
+        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      // 8 -> 16
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
+      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
+      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
+      // top_x * weights_y + scale_m_weights_y * bottom_left
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      pred_lo = _mm_add_epi32(pred_lo, round);
+      pred_hi = _mm_add_epi32(pred_hi, round);
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
 
-  pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
-  pixels[8] = _mm_load_si128((const __m128i *)left);
-  pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab0, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab0, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
+}
 
-  x = _mm_unpacklo_epi8(ab1, zero);
-  pixels[4] = _mm_unpacklo_epi16(x, bp);
-  pixels[5] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
+}
 
-  x = _mm_unpackhi_epi8(ab1, zero);
-  pixels[6] = _mm_unpacklo_epi16(x, bp);
-  pixels[7] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
 }
 
-static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
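SMOOTH_H mirrors SMOOTH_V with horizontal taps only: each pixel blends its left-column sample with the top-right corner, using the same half-scale rounding. A scalar sketch, not part of the diff:

#include <stdint.h>

static uint8_t smooth_h_px(int left_px, int top_right, int w_w,
                           int log2_scale) {
  const int scale = 1 << log2_scale;
  const int sum = w_w * left_px + (scale - w_w) * top_right;
  return (uint8_t)((sum + (scale >> 1)) >> log2_scale); /* round to nearest */
}
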
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  if (height == 4)
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  (void)height;
+  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
   const __m128i zero = _mm_setzero_si128();
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+
+  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i rep = _mm_set1_epi16(0x8000);
 
-    __m128i x = _mm_unpacklo_epi8(w32_0, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+    b = _mm_unpacklo_epi16(b, pixel[1]);
+    __m128i sum = _mm_madd_epi16(b, weight[0]);
 
-    x = _mm_unpackhi_epi8(w32_0, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
 
-    x = _mm_unpacklo_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[4] = _mm_unpacklo_epi16(x, y);
-    weight_w[5] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
 
-    x = _mm_unpackhi_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[6] = _mm_unpacklo_epi16(x, y);
-    weight_w[7] = _mm_unpackhi_epi16(x, y);
+    rep = _mm_add_epi16(rep, one);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 4, pixels);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 4, &weights);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
 
-    weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
-    weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
-    weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
-    weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[3] = pixels[1];
   }
 }
 
-static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_w) {
+  (void)height;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
+                                     int h, uint8_t *dst, ptrdiff_t stride,
+                                     int second_half) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
+    b = _mm_unpacklo_epi16(b, pixels[1]);
+    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
 
-    int j;
-    __m128i s[8];
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[10]);
+    sum0 = _mm_add_epi32(sum0, pred_round);
+    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; ++j) {
-      s[j] = _mm_madd_epi16(pixels[j], wh_sc);
-      s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
-      s[j] = _mm_add_epi32(s[j], round);
-      s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
-    }
+    sum1 = _mm_add_epi32(sum1, pred_round);
+    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; j += 2) {
-      __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
-      sum = _mm_shuffle_epi8(sum, gat);
-      _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
-    }
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, gat);
+    _mm_storel_epi64((__m128i *)dst, sum0);
     dst += stride;
+    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 4, pixels);
 
-  __m128i wh[4], ww[8];
-  load_weight_w32(sm_weight_arrays, 16, wh, ww);
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 4, ww);
 
-  smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
 }
 
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 32, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 8, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 8, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 16, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 16, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+}
 
-  __m128i wh[8], ww[8];
-  load_weight_w32(sm_weight_arrays, 32, wh, ww);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[4];
+  load_pixel_h_w8(above, left, 32, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 32, ww);
 
-  smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i tr_ly =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
+      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
+      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
+      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
+      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
+      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_round);
+      pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
 }
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
deleted file mode 100644
index bc1bb2ff3..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pb_1: times 16 db 1
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
-sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
-sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
-sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb %4, %1, %3
-  pxor  %3, %1
-  pand  %3, [GLOBAL(pb_1)]
-  psubb %4, %3
-  pavgb %4, %2
-%endmacro
-
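The trick documented in the deleted macro above is worth spelling out: pavgb computes (x + z + 1) >> 1, whose rounding error relative to the 3-tap average (x + 2y + z + 2) >> 2 is exactly the low bit of x ^ z (the low bit of x + z); subtracting that bit before averaging with y makes the result exact. The same computation in scalar C, for reference:

#include <stdint.h>

static uint8_t avg3_round(uint8_t x, uint8_t y, uint8_t z) {
  uint8_t r = (uint8_t)((x + z + 1) >> 1); /* pavgb(x, z)                  */
  r = (uint8_t)(r - ((x ^ z) & 1));        /* undo the carried-in rounding */
  return (uint8_t)((r + y + 1) >> 1);      /* pavgb(r, y) == (x+2y+z+2)>>2 */
}
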
-INIT_XMM ssse3
-cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
-  GET_GOT goffsetq
-
-  movq   m3, [aboveq]
-  pshufb m1, m3, [GLOBAL(sh_b23456777)]
-  pshufb m2, m3, [GLOBAL(sh_b12345677)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
-  pavgb  m3, m2
-
-  ; store 4 lines
-  movd [dstq        ], m3
-  movd [dstq+strideq], m4
-  lea  dstq, [dstq+strideq*2]
-  psrldq m3, 1
-  psrldq m4, 1
-  movd [dstq        ], m3
-  movd [dstq+strideq], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT goffsetq
-  movd m0, [leftq]               ; l1, l2, l3, l4
-  movd m1, [aboveq-1]            ; tl, t1, t2, t3
-  punpckldq m0, m1               ; l1, l2, l3, l4, tl, t1, t2, t3
-  pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
-  psrldq m1, m0, 1               ; l3, l2, l1, tl, t1, t2, t3
-  psrldq m2, m0, 2               ; l2, l1, tl, t1, t2, t3
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1
-  ; A2 B2 A1 B1
-  ; A3 B3 A2 B2
-  ; A4 B4 A3 B3
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
-  pavgb m1, m0                   ; 2-tap avg A4 A3 A2 A1
-
-  punpcklqdq m3, m1              ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
-
-  DEFINE_ARGS dst, stride, stride3
-  lea stride3q, [strideq*3]
-  pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd [dstq+stride3q ], m3
-  psrldq m3, 2                   ; A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd [dstq+strideq*2], m3
-  psrldq m3, 2                   ; A2 B2 A1 B1 C1 D1 ..
-  movd [dstq+strideq  ], m3
-  psrldq m3, 2                   ; A1 B1 C1 D1 ..
-  movd [dstq          ], m3
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT goffsetq
-  movq m0, [leftq]                     ; [0- 7] l1-8 [byte]
-  movhps m0, [aboveq-1]                ; [8-15] tl, t1-7 [byte]
-  pshufb m1, m0, [GLOBAL(sh_b76543210)]; l8-1 [word]
-  pshufb m2, m0, [GLOBAL(sh_b65432108)]; l7-1,tl [word]
-  pshufb m3, m0, [GLOBAL(sh_b54321089)]; l6-1,tl,t1 [word]
-  pshufb m0, [GLOBAL(sh_b89abcdef)]    ; tl,t1-7 [word]
-  psrldq m4, m0, 1                     ; t1-7 [word]
-  psrldq m5, m0, 2                     ; t2-7 [word]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1
-  ; A2 B2 A1 B1 C1 D1 E1 F1
-  ; A3 B3 A2 B2 A1 B1 C1 D1
-  ; A4 B4 A3 B3 A2 B2 A1 B1
-  ; A5 B5 A4 B4 A3 B3 A2 B2
-  ; A6 B6 A5 B5 A4 B4 A3 B3
-  ; A7 B7 A6 B6 A5 B5 A4 B4
-  ; A8 B8 A7 B7 A6 B6 A5 B5
-  pavgb m6, m1, m2                     ; 2-tap avg A8-A1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
-
-  punpcklbw m6, m0                     ; A-B8, A-B7 ... A-B2, A-B1
-
-  DEFINE_ARGS dst, stride, stride3
-  lea stride3q, [strideq*3]
-
-  movhps [dstq+stride3q], m6           ; A-B4, A-B3, A-B2, A-B1
-  palignr m0, m7, m6, 10               ; A-B3, A-B2, A-B1, C-H1
-  movq [dstq+strideq*2], m0
-  psrldq m0, 2                         ; A-B2, A-B1, C-H1
-  movq [dstq+strideq ], m0
-  psrldq m0, 2                         ; A-H1
-  movq [dstq ], m0
-  lea dstq, [dstq+strideq*4]
-  movq [dstq+stride3q ], m6            ; A-B8, A-B7, A-B6, A-B5
-  psrldq m6, 2                         ; A-B7, A-B6, A-B5, A-B4
-  movq [dstq+strideq*2], m6
-  psrldq m6, 2                         ; A-B6, A-B5, A-B4, A-B3
-  movq [dstq+strideq ], m6
-  psrldq m6, 2                         ; A-B5, A-B4, A-B3, A-B2
-  movq [dstq ], m6
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT goffsetq
-  mova m0, [leftq]
-  movu m7, [aboveq-1]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
-  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
-  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
-  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
-  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
-  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
-  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
-  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
-  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
-  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
-  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
-  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
-  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
-  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
-  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
-  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
-  pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr m5, m0, m6, 15
-  palignr m3, m0, m6, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
-  pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
-  pavgb m5, m0                                 ; A1 - Ag
-
-  punpcklbw m0, m4, m5                         ; A-B8 ... A-B1
-  punpckhbw m4, m5                             ; A-B9 ... A-Bg
-
-  pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
-
-  pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
-  DEFINE_ARGS dst, stride, stride3
-  lea stride3q, [strideq*3]
-  palignr m2, m1, m6, 14
-  mova [dstq          ], m2
-  palignr m2, m1, m6, 12
-  mova [dstq+strideq  ], m2
-  palignr m2, m1, m6, 10
-  mova [dstq+strideq*2], m2
-  palignr m2, m1, m6, 8
-  mova [dstq+stride3q ], m2
-  lea dstq, [dstq+strideq*4]
-  palignr m2, m1, m6, 6
-  mova [dstq          ], m2
-  palignr m2, m1, m6, 4
-  mova [dstq+strideq  ], m2
-  palignr m2, m1, m6, 2
-  mova [dstq+strideq*2], m2
-  pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
-  mova [dstq+stride3q ], m6
-  lea dstq, [dstq+strideq*4]
-
-  palignr m2, m6, m4, 14
-  mova [dstq          ], m2
-  palignr m2, m6, m4, 12
-  mova [dstq+strideq  ], m2
-  palignr m2, m6, m4, 10
-  mova [dstq+strideq*2], m2
-  palignr m2, m6, m4, 8
-  mova [dstq+stride3q ], m2
-  lea dstq, [dstq+strideq*4]
-  palignr m2, m6, m4, 6
-  mova [dstq          ], m2
-  palignr m2, m6, m4, 4
-  mova [dstq+strideq  ], m2
-  palignr m2, m6, m4, 2
-  mova [dstq+strideq*2], m2
-  mova [dstq+stride3q ], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT goffsetq
-  mova  m0, [leftq]
-  movu  m7, [aboveq-1]
-  movu  m1, [aboveq+15]
-
-  pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
-
-  palignr m3, m1, m7, 1
-  palignr m5, m1, m7, 2
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
-
-  pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr m5, m0, m7, 15
-  palignr m3, m0, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
-  pavgb m5, m0                                 ; A1 - Ag
-  punpcklbw m6, m4, m5                         ; A-B8 ... A-B1
-  punpckhbw m4, m5                             ; A-B9 ... A-Bg
-  pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
-
-  DEFINE_ARGS dst, stride, stride3, left, line
-  lea stride3q, [strideq*3]
-
-  palignr m5, m2, m1, 14
-  palignr m7, m1, m6, 14
-  mova [dstq            ], m7
-  mova [dstq+16         ], m5
-  palignr m5, m2, m1, 12
-  palignr m7, m1, m6, 12
-  mova [dstq+strideq    ], m7
-  mova [dstq+strideq+16 ], m5
-  palignr m5, m2, m1, 10
-  palignr m7, m1, m6, 10
-  mova [dstq+strideq*2  ], m7
-  mova [dstq+strideq*2+16], m5
-  palignr m5, m2, m1, 8
-  palignr m7, m1, m6, 8
-  mova [dstq+stride3q   ], m7
-  mova [dstq+stride3q+16 ], m5
-  lea dstq, [dstq+strideq*4]
-  palignr m5, m2, m1, 6
-  palignr m7, m1, m6, 6
-  mova [dstq            ], m7
-  mova [dstq+16         ], m5
-  palignr m5, m2, m1, 4
-  palignr m7, m1, m6, 4
-  mova [dstq+strideq    ], m7
-  mova [dstq+strideq+16 ], m5
-  palignr m5, m2, m1, 2
-  palignr m7, m1, m6, 2
-  mova [dstq+strideq*2  ], m7
-  mova [dstq+strideq*2+16], m5
-  mova [dstq+stride3q   ], m6
-  mova [dstq+stride3q+16 ], m1
-  lea dstq, [dstq+strideq*4]
-
-  palignr m5, m1, m6, 14
-  palignr m3, m6, m4, 14
-  mova [dstq            ], m3
-  mova [dstq+16         ], m5
-  palignr m5, m1, m6, 12
-  palignr m3, m6, m4, 12
-  mova [dstq+strideq    ], m3
-  mova [dstq+strideq+16 ], m5
-  palignr m5, m1, m6, 10
-  palignr m3, m6, m4, 10
-  mova [dstq+strideq*2  ], m3
-  mova [dstq+strideq*2+16], m5
-  palignr m5, m1, m6, 8
-  palignr m3, m6, m4, 8
-  mova [dstq+stride3q   ], m3
-  mova [dstq+stride3q+16 ], m5
-  lea dstq, [dstq+strideq*4]
-  palignr m5, m1, m6, 6
-  palignr m3, m6, m4, 6
-  mova [dstq            ], m3
-  mova [dstq+16         ], m5
-  palignr m5, m1, m6, 4
-  palignr m3, m6, m4, 4
-  mova [dstq+strideq    ], m3
-  mova [dstq+strideq+16 ], m5
-  palignr m5, m1, m6, 2
-  palignr m3, m6, m4, 2
-  mova [dstq+strideq*2  ], m3
-  mova [dstq+strideq*2+16], m5
-  mova [dstq+stride3q   ], m4
-  mova [dstq+stride3q+16 ], m6
-  lea dstq, [dstq+strideq*4]
-
-  mova m7, [leftq]
-  mova m3, [leftq+16]
-  palignr m5, m3, m7, 15
-  palignr m0, m3, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
-
-  pavgb m5, m3                                 ; Ah -
-
-  punpcklbw m3, m2, m5                         ; A-B8 ... A-B1
-  punpckhbw m2, m5                             ; A-B9 ...
A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c deleted file mode 100644 index a9d6a127c..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c +++ /dev/null @@ -1,1238 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/x86/inv_txfm_common_avx2.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - load_buffer_16x16(input, in); - mm256_transpose_16x16(in, in); - av1_idct16_avx2(in); - mm256_transpose_16x16(in, in); - av1_idct16_avx2(in); - store_buffer_16xN(in, stride, dest, 16); -} - -static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) { - const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); - const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]); - const __m256i v0 = _mm256_unpacklo_epi32(u0, u1); - const __m256i v1 = _mm256_unpackhi_epi32(u0, u1); - in[0] = _mm256_permute4x64_epi64(v0, 0xA8); - in[1] = _mm256_permute4x64_epi64(v0, 0xA9); - in[2] = _mm256_permute4x64_epi64(v1, 0xA8); - in[3] = _mm256_permute4x64_epi64(v1, 0xA9); -} - -#define MM256_SHUFFLE_EPI64(x0, x1, imm8) \ - _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \ - _mm256_castsi256_pd(x1), imm8)) - -static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) { - int i; - for (i = 0; i < 16; i += 4) { - transpose_col_to_row_nz4x4(&in[i]); - } - - for (i = 0; i < 4; ++i) { - in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0); - in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0); - } - - for (i = 0; i < 4; ++i) { - in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); - } -} - -// Coefficients 0-7 before the final butterfly -static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) { - const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28); - const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04); - - const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i v1 = v0; - - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - __m256i v5, v6; - unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6); - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v1, v5); - out[3] = _mm256_add_epi16(v0, v4); - out[4] = _mm256_sub_epi16(v0, v4); - out[5] = _mm256_sub_epi16(v1, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -// Coefficients 8-15 before the final butterfly -static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) { - const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); - const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); - - const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); - const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); - - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - __m256i t1, t2, t5, t6; - 
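  // unpack_butter_fly() is defined in inv_txfm_common_avx2.h (removed later
  // in this diff). Per 16-bit lane, with c0 = pair256_set_epi16(A, B) and
  // c1 = pair256_set_epi16(C, D), it computes, assuming the usual
  // DCT_CONST_BITS == 14,
  //   out0 = (x*A + y*B + 8192) >> 14,  out1 = (x*C + y*D + 8192) >> 14,
  // i.e. a planar rotation with dct_const_round_shift() rounding.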
unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); - unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); - - out[0] = _mm256_add_epi16(t0, t3); - out[1] = _mm256_add_epi16(t1, t2); - out[6] = _mm256_add_epi16(t6, t5); - out[7] = _mm256_add_epi16(t7, t4); - - const __m256i v2 = _mm256_sub_epi16(t1, t2); - const __m256i v3 = _mm256_sub_epi16(t0, t3); - const __m256i v4 = _mm256_sub_epi16(t7, t4); - const __m256i v5 = _mm256_sub_epi16(t6, t5); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); - unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); -} - -static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm256_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); - i++; - } -} - -static INLINE void idct16_10(__m256i *in /*in[16]*/) { - __m256i out[16]; - idct16_10_first_half(in, out); - idct16_10_second_half(in, &out[8]); - add_sub_butterfly(out, in, 16); -} - -void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - - load_coeff(input, &in[0]); - load_coeff(input + 16, &in[1]); - load_coeff(input + 32, &in[2]); - load_coeff(input + 48, &in[3]); - - transpose_col_to_row_nz4x4(in); - idct16_10(in); - - transpose_col_to_row_nz4x16(in); - idct16_10(in); - - store_buffer_16xN(in, stride, dest, 16); -} - -// Note: -// For 16x16 int16_t matrix -// transpose first 8 columns into first 8 rows. -// Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]). -// After transposing, the 8 row vectors are in in[8]. -void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) { - __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]); - - const __m256i v0 = _mm256_unpacklo_epi32(u0, u2); - const __m256i v1 = _mm256_unpackhi_epi32(u0, u2); - const __m256i v2 = _mm256_unpacklo_epi32(u1, u3); - const __m256i v3 = _mm256_unpackhi_epi32(u1, u3); - - u0 = _mm256_unpacklo_epi16(in[4], in[5]); - u1 = _mm256_unpackhi_epi16(in[4], in[5]); - u2 = _mm256_unpacklo_epi16(in[6], in[7]); - u3 = _mm256_unpackhi_epi16(in[6], in[7]); - - const __m256i v4 = _mm256_unpacklo_epi32(u0, u2); - const __m256i v5 = _mm256_unpackhi_epi32(u0, u2); - const __m256i v6 = _mm256_unpacklo_epi32(u1, u3); - const __m256i v7 = _mm256_unpackhi_epi32(u1, u3); - - in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0); - in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3); - in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0); - in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3); - in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0); - in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3); - in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0); - in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3); -} - -// Note: -// For 16x16 int16_t matrix -// transpose first 8 columns into first 8 rows. -// Since only matrix left 8x16 are non-zero, the input are total 16 rows -// (in[16]). -// After transposing, the 8 row vectors are in in[8]. All else are zero. 
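// A scalar model of this partial transpose (illustrative only; rows 8..15
// of the result would be all zero, so they are simply never produced):
//   for (int r = 0; r < 16; ++r)
//     for (int c = 0; c < 8; ++c) dst[c][r] = src[r][c];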
-static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) { - transpose_col_to_row_nz8x8(in); - transpose_col_to_row_nz8x8(&in[8]); - - int i; - for (i = 0; i < 8; ++i) { - in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); - } -} - -static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) { - const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28); - __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04); - - const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20); - __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12); - - const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24); - const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08); - - const __m256i u4 = _mm256_add_epi16(t4, t5); - const __m256i u5 = _mm256_sub_epi16(t4, t5); - const __m256i u6 = _mm256_sub_epi16(t7, t6); - const __m256i u7 = _mm256_add_epi16(t7, t6); - - const __m256i t0 = _mm256_add_epi16(u0, u3); - const __m256i t1 = _mm256_add_epi16(u1, u2); - const __m256i t2 = _mm256_sub_epi16(u1, u2); - const __m256i t3 = _mm256_sub_epi16(u0, u3); - - t4 = u4; - t7 = u7; - - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); - - out[0] = _mm256_add_epi16(t0, t7); - out[1] = _mm256_add_epi16(t1, t6); - out[2] = _mm256_add_epi16(t2, t5); - out[3] = _mm256_add_epi16(t3, t4); - out[4] = _mm256_sub_epi16(t3, t4); - out[5] = _mm256_sub_epi16(t2, t5); - out[6] = _mm256_sub_epi16(t1, t6); - out[7] = _mm256_sub_epi16(t0, t7); -} - -static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) { - const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); - __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); - - const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18); - __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14); - - const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22); - __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10); - - const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); - __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); - - __m256i v0, v1, v2, v3, v4, v5, v6, v7; - v0 = _mm256_add_epi16(t0, t1); - v1 = _mm256_sub_epi16(t0, t1); - v2 = _mm256_sub_epi16(t3, t2); - v3 = _mm256_add_epi16(t2, t3); - v4 = _mm256_add_epi16(t4, t5); - v5 = 
_mm256_sub_epi16(t4, t5); - v6 = _mm256_sub_epi16(t7, t6); - v7 = _mm256_add_epi16(t6, t7); - - t0 = v0; - t7 = v7; - t3 = v3; - t4 = v4; - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); - unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); - - v0 = _mm256_add_epi16(t0, t3); - v1 = _mm256_add_epi16(t1, t2); - v2 = _mm256_sub_epi16(t1, t2); - v3 = _mm256_sub_epi16(t0, t3); - v4 = _mm256_sub_epi16(t7, t4); - v5 = _mm256_sub_epi16(t6, t5); - v6 = _mm256_add_epi16(t6, t5); - v7 = _mm256_add_epi16(t7, t4); - - // stage 6, (8-15) - out[0] = v0; - out[1] = v1; - out[6] = v6; - out[7] = v7; - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); - unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); -} - -static INLINE void idct16_38(__m256i *in /*in[16]*/) { - __m256i out[16]; - idct16_38_first_half(in, out); - idct16_38_second_half(in, &out[8]); - add_sub_butterfly(out, in, 16); -} - -void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - - int i; - for (i = 0; i < 8; ++i) { - load_coeff(input + (i << 4), &in[i]); - } - - transpose_col_to_row_nz8x8(in); - idct16_38(in); - - transpose_col_to_row_nz8x16(in); - idct16_38(in); - - store_buffer_16xN(in, stride, dest, 16); -} - -static INLINE int calculate_dc(const tran_low_t *input) { - int dc = (int)dct_const_round_shift(input[0] * cospi_16_64); - dc = (int)dct_const_round_shift(dc * cospi_16_64); - dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS); - return dc; -} - -void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - const int dc = calculate_dc(input); - if (dc == 0) return; - - const __m256i dc_value = _mm256_set1_epi16(dc); - - int i; - for (i = 0; i < 16; ++i) { - recon_and_store(&dc_value, dest); - dest += stride; - } -} - -// ----------------------------------------------------------------------------- -// 32x32 partial IDCT - -void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - const int dc = calculate_dc(input); - if (dc == 0) return; - - const __m256i dc_value = _mm256_set1_epi16(dc); - - int i; - for (i = 0; i < 32; ++i) { - recon_and_store(&dc_value, dest); - recon_and_store(&dc_value, dest + 16); - dest += stride; - } -} - -static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) { - int i; - for (i = 0; i < 16; ++i) { - load_coeff(input, &in[i]); - load_coeff(input + 16, &in[i + 16]); - input += 32; - } -} - -// Note: -// We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we -// operate coefficients on __m256i. Our operation capacity doubles for each -// instruction. 
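/* Scalar model of calculate_dc() above -- an illustrative sketch assuming
 * the usual constants from aom_dsp/txfm_common.h (cospi_16_64 == 11585,
 * DCT_CONST_BITS == 14) and IDCT_ROUNDING_POS == 6 from
 * inv_txfm_common_avx2.h: */
static int dc_only_value_model(int16_t input0) {
  const int32_t c16 = 11585;                                /* cospi_16_64 */
  int32_t dc = ((int32_t)input0 * c16 + (1 << 13)) >> 14;   /* round shift 1 */
  dc = (dc * c16 + (1 << 13)) >> 14;                        /* round shift 2 */
  return (dc + (1 << 5)) >> 6;        /* ROUND_POWER_OF_TWO(dc, 6) */
}
/* Example: input0 = 1024 -> 724 -> 512 -> 8, so the DC-only path adds 8 to
 * every reconstructed pixel. */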
-#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm256_madd_epi16(x0, co0); \ - tmp1 = _mm256_madd_epi16(x1, co0); \ - tmp2 = _mm256_madd_epi16(x0, co1); \ - tmp3 = _mm256_madd_epi16(x1, co1); \ - tmp0 = _mm256_add_epi32(tmp0, rounding); \ - tmp1 = _mm256_add_epi32(tmp1, rounding); \ - tmp2 = _mm256_add_epi32(tmp2, rounding); \ - tmp3 = _mm256_add_epi32(tmp3, rounding); \ - tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m256i *x0, const __m256i *x1, - const __m256i *c0, const __m256i *c1, __m256i *y0, - __m256i *y1) { - __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm256_unpacklo_epi16(*x0, *x1); - u1 = _mm256_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm256_packs_epi32(tmp0, tmp1); - *y1 = _mm256_packs_epi32(tmp2, tmp3); -} - -static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0, - const __m256i *c1) { - __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm256_unpacklo_epi16(*x0, *x1); - u1 = _mm256_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm256_packs_epi32(tmp0, tmp1); - *x1 = _mm256_packs_epi32(tmp2, tmp3); -} - -// For each 16x32 block __m256i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m256i in[32] -static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/, - __m256i *out /*out[16]*/) { - __m256i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m256i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64); - const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64); - const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64); - const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm256_add_epi16(u8, u9); - v9 = _mm256_sub_epi16(u8, u9); - v14 = _mm256_sub_epi16(u15, u14); - v15 = _mm256_add_epi16(u15, u14); - - { - const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64); - const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64); - const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64); - const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm256_sub_epi16(u11, u10); - v11 = _mm256_add_epi16(u11, u10); - v12 = _mm256_add_epi16(u12, u13); - v13 = _mm256_sub_epi16(u12, u13); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(v8, v11); - out[1] = _mm256_add_epi16(v9, v10); - out[6] = _mm256_add_epi16(v14, v13); - out[7] = _mm256_add_epi16(v15, v12); - - out[2] = _mm256_sub_epi16(v9, v10); - out[3] = _mm256_sub_epi16(v8, v11); - out[4] = _mm256_sub_epi16(v15, v12); - out[5] = _mm256_sub_epi16(v14, 
v13); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m256i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m256i in[32] -static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/, - __m256i *out /*out[8]*/) { - __m256i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m256i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm256_add_epi16(u4, u5); - v5 = _mm256_sub_epi16(u4, u5); - v6 = _mm256_sub_epi16(u7, u6); - v7 = _mm256_add_epi16(u7, u6); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm256_add_epi16(u0, u3); - v1 = _mm256_add_epi16(u1, u2); - v2 = _mm256_sub_epi16(u1, u2); - v3 = _mm256_sub_epi16(u0, u3); - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v2, v5); - out[3] = _mm256_add_epi16(v3, v4); - out[4] = _mm256_sub_epi16(v3, v4); - out[5] = _mm256_sub_epi16(v2, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -// For each 8x32 block __m256i in[32], -// Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m256i in[32] -// We avoid hide an offset, 16, inside this function. 
So we output 0-15 into -// array out[16] -static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/, - __m256i *out /*out[16]*/) { - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm256_add_epi16(u16, u17); - v17 = _mm256_sub_epi16(u16, u17); - v18 = _mm256_sub_epi16(u19, u18); - v19 = _mm256_add_epi16(u19, u18); - - v20 = _mm256_add_epi16(u20, u21); - v21 = _mm256_sub_epi16(u20, u21); - v22 = _mm256_sub_epi16(u23, u22); - v23 = _mm256_add_epi16(u23, u22); - - v24 = _mm256_add_epi16(u24, u25); - v25 = _mm256_sub_epi16(u24, u25); - v26 = _mm256_sub_epi16(u27, u26); - v27 = _mm256_add_epi16(u27, u26); - - v28 = _mm256_add_epi16(u28, u29); - v29 = _mm256_sub_epi16(u28, u29); - v30 = _mm256_sub_epi16(u31, u30); - v31 = _mm256_add_epi16(u31, u30); - - { - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - - u24 = _mm256_add_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = 
_mm256_sub_epi16(v25, v26); - u27 = _mm256_sub_epi16(v24, v27); - - u28 = _mm256_sub_epi16(v31, v28); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - u31 = _mm256_add_epi16(v28, v31); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(u16, u23); - out[1] = _mm256_add_epi16(u17, u22); - out[2] = _mm256_add_epi16(u18, u21); - out[3] = _mm256_add_epi16(u19, u20); - out[4] = _mm256_sub_epi16(u19, u20); - out[5] = _mm256_sub_epi16(u18, u21); - out[6] = _mm256_sub_epi16(u17, u22); - out[7] = _mm256_sub_epi16(u16, u23); - - out[8] = _mm256_sub_epi16(u31, u24); - out[9] = _mm256_sub_epi16(u30, u25); - out[10] = _mm256_sub_epi16(u29, u26); - out[11] = _mm256_sub_epi16(u28, u27); - out[12] = _mm256_add_epi16(u27, u28); - out[13] = _mm256_add_epi16(u26, u29); - out[14] = _mm256_add_epi16(u25, u30); - out[15] = _mm256_add_epi16(u24, u31); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/, - __m256i *out /*out[32]*/) { - __m256i temp[16]; - idct32_full_16x32_quarter_1(in, temp); - idct32_full_16x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -static void idct32_16x32(const __m256i *in /*in[32]*/, - __m256i *out /*out[32]*/) { - __m256i temp[32]; - idct32_full_16x32_quarter_1_2(in, temp); - idct32_full_16x32_quarter_3_4(in, &temp[16]); - add_sub_butterfly(temp, out, 32); -} - -void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i col[64], in[32]; - int i; - - for (i = 0; i < 2; ++i) { - load_buffer_32x16(input, in); - input += 32 << 4; - - mm256_transpose_16x16(in, in); - mm256_transpose_16x16(&in[16], &in[16]); - idct32_16x32(in, col + (i << 5)); - } - - for (i = 0; i < 2; ++i) { - int j = i << 4; - mm256_transpose_16x16(col + j, in); - mm256_transpose_16x16(col + j + 32, &in[16]); - idct32_16x32(in, in); - store_buffer_16xN(in, stride, dest, 32); - dest += 16; - } -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/, - __m256i *out /*out[8]*/) { - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm256_mulhrs_epi16(in[0], stk4_0); - u2 = _mm256_mulhrs_epi16(in[8], stk4_2); - u3 = _mm256_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm256_add_epi16(u0, u3); - v1 = _mm256_add_epi16(u1, u2); - v2 = _mm256_sub_epi16(u1, u2); - v3 = _mm256_sub_epi16(u0, u3); - - { - const __m256i 
stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m256i stk3_2 = - pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm256_mulhrs_epi16(in[4], stk3_0); - u7 = _mm256_mulhrs_epi16(in[4], stk3_1); - u5 = _mm256_mulhrs_epi16(in[12], stk3_2); - u6 = _mm256_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm256_add_epi16(u4, u5); - v5 = _mm256_sub_epi16(u4, u5); - v6 = _mm256_sub_epi16(u7, u6); - v7 = _mm256_add_epi16(u7, u6); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v2, v5); - out[3] = _mm256_add_epi16(v3, v4); - out[4] = _mm256_sub_epi16(v3, v4); - out[5] = _mm256_sub_epi16(v2, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/, - __m256i *out /*out[8]*/) { - __m256i u8, u9, u10, u11, u12, u13, u14, u15; - __m256i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i stk2_2 = - pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m256i stk2_6 = - pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm256_mulhrs_epi16(in[2], stk2_0); - u15 = _mm256_mulhrs_epi16(in[2], stk2_1); - u9 = _mm256_mulhrs_epi16(in[14], stk2_2); - u14 = _mm256_mulhrs_epi16(in[14], stk2_3); - u10 = _mm256_mulhrs_epi16(in[10], stk2_4); - u13 = _mm256_mulhrs_epi16(in[10], stk2_5); - u11 = _mm256_mulhrs_epi16(in[6], stk2_6); - u12 = _mm256_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm256_add_epi16(u8, u9); - v9 = _mm256_sub_epi16(u8, u9); - v10 = _mm256_sub_epi16(u11, u10); - v11 = _mm256_add_epi16(u11, u10); - v12 = _mm256_add_epi16(u12, u13); - v13 = _mm256_sub_epi16(u12, u13); - v14 = _mm256_sub_epi16(u15, u14); - v15 = _mm256_add_epi16(u15, u14); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(v8, v11); - out[1] = _mm256_add_epi16(v9, v10); - out[2] = _mm256_sub_epi16(v9, v10); - out[3] = _mm256_sub_epi16(v8, v11); - out[4] = _mm256_sub_epi16(v15, v12); - out[5] = _mm256_sub_epi16(v14, v13); - out[6] = _mm256_add_epi16(v14, v13); - out[7] = _mm256_add_epi16(v15, v12); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to 
out[32] -static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/, - __m256i *out /*out[32]*/) { - __m256i temp[16]; - idct32_16x32_135_quarter_1(in, temp); - idct32_16x32_135_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/, - __m256i *out /*out[32]*/) { - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m256i stk1_2 = - pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); - const __m256i stk1_6 = - pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m256i stk1_10 = - pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m256i stk1_14 = - pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = _mm256_mulhrs_epi16(in[1], stk1_0); - u31 = _mm256_mulhrs_epi16(in[1], stk1_1); - u17 = _mm256_mulhrs_epi16(in[15], stk1_2); - u30 = _mm256_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm256_mulhrs_epi16(in[9], stk1_4); - u29 = _mm256_mulhrs_epi16(in[9], stk1_5); - u19 = _mm256_mulhrs_epi16(in[7], stk1_6); - u28 = _mm256_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm256_mulhrs_epi16(in[5], stk1_8); - u27 = _mm256_mulhrs_epi16(in[5], stk1_9); - u21 = _mm256_mulhrs_epi16(in[11], stk1_10); - u26 = _mm256_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm256_mulhrs_epi16(in[13], stk1_12); - u25 = _mm256_mulhrs_epi16(in[13], stk1_13); - u23 = _mm256_mulhrs_epi16(in[3], stk1_14); - u24 = _mm256_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm256_add_epi16(u16, u17); - v17 = _mm256_sub_epi16(u16, u17); - v18 = _mm256_sub_epi16(u19, u18); - v19 = _mm256_add_epi16(u19, u18); - - v20 = _mm256_add_epi16(u20, u21); - v21 = _mm256_sub_epi16(u20, u21); - v22 = _mm256_sub_epi16(u23, u22); - v23 = _mm256_add_epi16(u23, u22); - - v24 = _mm256_add_epi16(u24, u25); - v25 = _mm256_sub_epi16(u24, u25); - v26 = _mm256_sub_epi16(u27, u26); - v27 = _mm256_add_epi16(u27, u26); - - v28 = _mm256_add_epi16(u28, u29); - v29 = _mm256_sub_epi16(u28, u29); - v30 = _mm256_sub_epi16(u31, u30); - v31 = _mm256_add_epi16(u31, u30); - - { - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = 
pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - - u24 = _mm256_add_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = _mm256_sub_epi16(v25, v26); - u27 = _mm256_sub_epi16(v24, v27); - u28 = _mm256_sub_epi16(v31, v28); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - u31 = _mm256_add_epi16(v28, v31); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(u16, u23); - out[1] = _mm256_add_epi16(u17, u22); - out[2] = _mm256_add_epi16(u18, u21); - out[3] = _mm256_add_epi16(u19, u20); - v20 = _mm256_sub_epi16(u19, u20); - v21 = _mm256_sub_epi16(u18, u21); - v22 = _mm256_sub_epi16(u17, u22); - v23 = _mm256_sub_epi16(u16, u23); - - v24 = _mm256_sub_epi16(u31, u24); - v25 = _mm256_sub_epi16(u30, u25); - v26 = _mm256_sub_epi16(u29, u26); - v27 = _mm256_sub_epi16(u28, u27); - out[12] = _mm256_add_epi16(u27, u28); - out[13] = _mm256_add_epi16(u26, u29); - out[14] = _mm256_add_epi16(u25, u30); - out[15] = _mm256_add_epi16(u24, u31); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] -static void idct32_16x32_135(__m256i *in /*in[32]*/) { - __m256i out[32]; - idct32_16x32_quarter_1_2(in, out); - idct32_16x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in, - int size) { - int i = 0; - while (i < size) { - load_coeff(coeff + (i << 5), &in[i]); - i += 1; - } -} - -static INLINE void zero_buffer(__m256i *in, int num) { - int i; - for (i = 0; i < num; ++i) { - in[i] = _mm256_setzero_si256(); - } -} - -// Only upper-left 16x16 has non-zero coeff -void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[32]; - zero_buffer(in, 32); - load_buffer_from_32x32(input, in, 16); - mm256_transpose_16x16(in, in); - idct32_16x32_135(in); - - __m256i out[32]; - mm256_transpose_16x16(in, out); - idct32_16x32_135(out); - store_buffer_16xN(out, stride, dest, 32); - mm256_transpose_16x16(&in[16], in); - idct32_16x32_135(in); - store_buffer_16xN(in, stride, dest + 16, 32); -} - -static void idct32_34_first_half(const __m256i *in, __m256i *stp1) { - const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * 
cospi_30_64); - const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i x0, x1, x4, x5, x6, x7; - __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm256_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm256_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm256_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm256_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm256_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm256_add_epi16(v0, v15); - stp1[15] = _mm256_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm256_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm256_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm256_add_epi16(u0, u1); - v9 = _mm256_add_epi16(u4, u6); - v10 = _mm256_sub_epi16(u4, u6); - v11 = _mm256_sub_epi16(u0, u1); - v12 = _mm256_sub_epi16(u2, u3); - v13 = _mm256_sub_epi16(u5, u7); - v14 = _mm256_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm256_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm256_add_epi16(x1, x6); // stp2_1 - v2 = _mm256_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm256_add_epi16(v1, v14); - stp1[14] = _mm256_sub_epi16(v1, v14); - - stp1[2] = _mm256_add_epi16(v2, v13); - stp1[13] = _mm256_sub_epi16(v2, v13); - - v3 = _mm256_add_epi16(x1, x4); // stp2_3 - v4 = _mm256_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm256_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm256_sub_epi16(x1, x6); // stp2_6 - v7 = _mm256_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm256_add_epi16(v3, v12); - stp1[12] = _mm256_sub_epi16(v3, v12); - - stp1[6] = _mm256_add_epi16(v6, v9); - stp1[9] = _mm256_sub_epi16(v6, v9); - - stp1[7] = _mm256_add_epi16(v7, v8); - stp1[8] = _mm256_sub_epi16(v7, v8); - - stp1[4] = _mm256_add_epi16(v4, v11); - stp1[11] = _mm256_sub_epi16(v4, v11); - - stp1[5] = _mm256_add_epi16(v5, v10); - stp1[10] = _mm256_sub_epi16(v5, v10); -} - -static void idct32_34_second_half(const __m256i *in, __m256i *stp1) { - const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * 
cospi_7_64); - const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm256_mulhrs_epi16(in[1], stk1_0); - v31 = _mm256_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm256_mulhrs_epi16(in[7], stk1_6); - v28 = _mm256_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm256_mulhrs_epi16(in[5], stk1_8); - v27 = _mm256_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm256_mulhrs_epi16(in[3], stk1_14); - v24 = _mm256_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - u24 = _mm256_add_epi16(v24, v27); - u27 = _mm256_sub_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = _mm256_sub_epi16(v25, v26); - u28 = _mm256_sub_epi16(v31, v28); - u31 = _mm256_add_epi16(v28, v31); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[0] = _mm256_add_epi16(u16, u23); - stp1[7] = _mm256_sub_epi16(u16, u23); - - stp1[1] = _mm256_add_epi16(u17, u22); - stp1[6] = _mm256_sub_epi16(u17, u22); - - stp1[2] = _mm256_add_epi16(u18, u21); - stp1[5] = _mm256_sub_epi16(u18, u21); - - stp1[3] = _mm256_add_epi16(u19, u20); - stp1[4] = _mm256_sub_epi16(u19, u20); - - stp1[8] = _mm256_sub_epi16(u31, u24); - stp1[15] = _mm256_add_epi16(u24, u31); - - stp1[9] = _mm256_sub_epi16(u30, u25); - stp1[14] = _mm256_add_epi16(u25, u30); - - stp1[10] = _mm256_sub_epi16(u29, u26); - stp1[13] = _mm256_add_epi16(u26, u29); - - stp1[11] = _mm256_sub_epi16(u28, u27); - stp1[12] = _mm256_add_epi16(u27, u28); - - butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0); - butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0); - butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0); - butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0); -} 
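/* The stk*_ constants above fold a butterfly whose partner coefficient is
 * zero into a single _mm256_mulhrs_epi16(). An illustrative scalar model
 * (standard constants assumed): mulhrs(x, 2*c) rounds x*2*c down from bit
 * 15, which equals dct_const_round_shift(x * c): */
static int16_t mulhrs_model(int16_t x, int16_t two_c) {
  return (int16_t)(((((int32_t)x * two_c) >> 14) + 1) >> 1);
}
/* x = 64 with two_c = 2 * cospi_16_64 = 23170 gives 45, matching
 * (64 * 11585 + 8192) >> 14 = 45. */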
- -// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] -static void idct32_16x32_34(__m256i *in /*in[32]*/) { - __m256i out[32]; - idct32_34_first_half(in, out); - idct32_34_second_half(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -// Only upper-left 8x8 has non-zero coeff -void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[32]; - zero_buffer(in, 32); - load_buffer_from_32x32(input, in, 8); - mm256_transpose_16x16(in, in); - idct32_16x32_34(in); - - __m256i out[32]; - mm256_transpose_16x16(in, out); - idct32_16x32_34(out); - store_buffer_16xN(out, stride, dest, 32); - mm256_transpose_16x16(&in[16], in); - idct32_16x32_34(in); - store_buffer_16xN(in, stride, dest + 16, 32); -} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h deleted file mode 100644 index 26c5cfe59..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H -#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H - -#include <immintrin.h> - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { - if (sizeof(tran_low_t) == 4) { - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); - } else { - *in = _mm256_loadu_si256((const __m256i *)coeff); - } -} - -static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { - int i = 0; - while (i < 16) { - load_coeff(coeff + (i << 4), &in[i]); - i += 1; - } -} - -static INLINE void recon_and_store(const __m256i *res, uint8_t *output) { - const __m128i zero = _mm_setzero_si128(); - __m128i x = _mm_loadu_si128((__m128i const *)output); - __m128i p0 = _mm_unpacklo_epi8(x, zero); - __m128i p1 = _mm_unpackhi_epi8(x, zero); - - p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res)); - p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1)); - x = _mm_packus_epi16(p0, p1); - _mm_storeu_si128((__m128i *)output, x); -} - -#define IDCT_ROUNDING_POS (6) -static INLINE void store_buffer_16xN(__m256i *in, const int stride, - uint8_t *output, int num) { - const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1)); - int i = 0; - - while (i < num) { - in[i] = _mm256_adds_epi16(in[i], rounding); - in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS); - recon_and_store(&in[i], output + i * stride); - i += 1; - } -} - -static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *c0, const __m256i *c1, - __m256i *b0, __m256i *b1) { - __m256i x0, x1; - x0 = _mm256_unpacklo_epi16(*a0, *a1); - x1 = _mm256_unpackhi_epi16(*a0, 
*a1); - *b0 = butter_fly(&x0, &x1, c0); - *b1 = butter_fly(&x0, &x1, c1); -} - -void av1_idct16_avx2(__m256i *in); - -#endif // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c deleted file mode 100644 index 86ce928b7..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c +++ /dev/null @@ -1,3500 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/x86/inv_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
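  // (0x4e is _MM_SHUFFLE(1, 0, 3, 2): it swaps the two 64-bit halves of
  // input1, lining the rotated halves up so the packed add/sub below form
  // the final sum/difference butterfly of the 4-point IDCT in one step.
  // The column pass further down repeats the same pattern.)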
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -void aom_idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], 
v[8]; - - array_transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void aom_iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - array_transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -// Define Macro for multiplying elements by constants and adding them together. 
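-// Each (lo, hi) pair below holds two input vectors interleaved into 16-bit
-// lanes, so _mm_madd_epi16 yields a[k]*c0 + b[k]*c1 in each 32-bit lane,
-// i.e. one butterfly output; the results are then rounded, shifted right by
-// DCT_CONST_BITS, and packed back to 16 bits.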
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. 
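- // (load_input_data() reads eight coefficients and, when tran_low_t is a
- //  32-bit type, packs them down to 16-bit lanes, the same trick as
- //  load_coeff() in the deleted AVX2 header above.)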
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from aom_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void aom_idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from aom_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], 
in[5], in[6], in[7]); -} - -void aom_iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = 
_mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit integers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = 
_mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); -} - -void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
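- // (With at most 12 non-zero coefficients they are assumed to sit in the
- //  top-left 4x4 corner of the 8x8 block, so four 8-wide rows cover all of
- //  them.)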
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = 
_mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = 
_mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = l; - for (i = 0; i < 2; 
i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); - dest += stride; - } -} - -void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const 
__m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = 
_mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
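Note on the arithmetic above: every madd / add k__DCT_CONST_ROUNDING / srai-by-DCT_CONST_BITS / packs run in these stages is the 4-lane form of one fixed-point rotation. A minimal scalar sketch, assuming the aom_dsp constants DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 1 << 13; the helper name is illustrative, not from the patch:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* One lane of what _mm_madd_epi16 + _mm_add_epi32 + _mm_srai_epi32 compute:
   a*c0 + b*c1, rounded back toward a 16-bit result. */
static int16_t butterfly_round_shift(int16_t a, int16_t b, int16_t c0,
                                     int16_t c1) {
  const int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;
  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}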
-
-void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
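The k__cospi_* constants above come from pair_set_epi16(a, b), which interleaves two 16-bit constants so that a single _mm_madd_epi16 evaluates x*a + y*b for four (x, y) pairs at once. A small self-contained demo of that effect; the local pair_set_epi16 here only mirrors the helper's documented behavior and is not the library's definition:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i pair_set_epi16(int16_t a, int16_t b) {
  return _mm_set1_epi32((int32_t)((uint16_t)a | ((uint32_t)(uint16_t)b << 16)));
}

int main(void) {
  const __m128i x = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  const __m128i y = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  const __m128i lo = _mm_unpacklo_epi16(x, y);  /* x0 y0 x1 y1 x2 y2 x3 y3 */
  const __m128i k = pair_set_epi16(3, -1);      /* per pair: 3*x - 1*y */
  const __m128i r = _mm_madd_epi16(lo, k);      /* four 32-bit results */
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* -7 -14 -21 -28 */
  return 0;
}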
-
-void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
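The tail of the loop above is the standard reconstruction step: round the inverse-transform output by 1 << 5, arithmetic-shift right by 6, add the result to the prediction, and clamp to 8 bits (what _mm_adds_epi16/_mm_srai_epi16 plus RECON_AND_STORE do 8 pixels at a time). A scalar sketch of that step, with illustrative helper names:

#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Per-pixel equivalent of the final rounding + RECON_AND_STORE loop. */
static void recon_row(uint8_t *dest, const int16_t *residual, int w) {
  for (int i = 0; i < w; ++i) {
    const int r = (residual[i] + (1 << 5)) >> 6;  /* final_rounding, >> 6 */
    dest[i] = clip_pixel(dest[i] + r);
  }
}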
-
-#define LOAD_DQCOEFF(reg, input) \
-  { \
-    reg = load_input_data(input); \
-    input += 8; \
-  }
-
-#define IDCT32_34 \
-  /* Stage1 */ \
-  { \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
-    \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
-    \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
-    \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
-    \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
-                             stp1_31); \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
-                             stp1_28); \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
-                             stp1_27); \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
-                             stp1_24); \
-  } \
-  \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
-    \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
-    \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
-                             stp2_15); \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
-                             stp2_12); \
-    \
-    stp2_16 = stp1_16; \
-    stp2_19 = stp1_19; \
-    \
-    stp2_20 = stp1_20; \
-    stp2_23 = stp1_23; \
-    \
-    stp2_24 = stp1_24; \
-    stp2_27 = stp1_27; \
-    \
-    stp2_28 = stp1_28; \
-    stp2_31 = stp1_31; \
-  } \
-  \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
-    \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
-    \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
-    \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
-                             stp1_7); \
-    \
-    stp1_8 = stp2_8; \
-    stp1_11 = stp2_11; \
-    stp1_12 = stp2_12; \
-    stp1_15 = stp2_15; \
-    \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
-                           stp1_29) \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25) \
-    \
-    stp1_16 = stp2_16; \
-    stp1_31 = stp2_31; \
-    stp1_19 = stp2_19; \
-    stp1_20 = stp2_20; \
-    stp1_23 = stp2_23; \
-    stp1_24 = stp2_24; \
-    stp1_27 = stp2_27; \
-    stp1_28 = stp2_28; \
-  } \
-  \
-  /* Stage4 */ \
-  { \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
-    \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
-    \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
-                             stp2_1); \
-    \
-    stp2_4 = stp1_4; \
-    stp2_5 = stp1_4; \
-    stp2_6 = stp1_7; \
-    stp2_7 = stp1_7; \
-    \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
-                           stp2_13) \
-    \
-    stp2_8 = stp1_8; \
-    stp2_15 = stp1_15; \
-    stp2_11 = stp1_11; \
-    stp2_12 = stp1_12; \
-    \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-    \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-  } \
-  \
-  /* Stage5 */ \
-  { \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-    \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-    \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-    \
-    stp1_0 = stp2_0; \
-    stp1_1 = stp2_1; \
-    stp1_2 = stp2_1; \
-    stp1_3 = stp2_0; \
-    \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-    \
-    stp1_4 = stp2_4; \
-    stp1_7 = stp2_7; \
-    \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-    \
-    stp1_16 = stp2_16; \
-    stp1_17 = stp2_17; \
-    \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
-                           stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
-                           stp1_26) \
-    \
-    stp1_22 = stp2_22; \
-    stp1_23 = stp2_23; \
-    stp1_24 = stp2_24; \
-    stp1_25 = stp2_25; \
-    stp1_30 = stp2_30; \
-    stp1_31 = stp2_31; \
-  } \
-  \
-  /* Stage6 */ \
-  { \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-    \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-    \
-    stp2_8 = stp1_8; \
-    stp2_9 = stp1_9; \
-    stp2_14 = stp1_14; \
-    stp2_15 = stp1_15; \
-    \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
-                           stp2_12) \
-    \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-    \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-  } \
-  \
-  /* Stage7 */ \
-  { \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-    \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-    \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-    \
-    stp1_16 = stp2_16; \
-    stp1_17 = stp2_17; \
-    stp1_18 = stp2_18; \
-    stp1_19 = stp2_19; \
-    \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
-                           stp1_26) \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
-                           stp1_24) \
-    \
-    stp1_28 = stp2_28; \
-    stp1_29 = stp2_29; \
-    stp1_30 = stp2_30; \
-    stp1_31 = stp2_31; \
-  }
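IDCT32_34 leans on the zero half of each unpack: with at most the upper-left 8x8 of coefficients nonzero, interleaving a row with zero makes _mm_madd_epi16 degenerate to a plain per-lane product, so Stage1 needs one multiply per output instead of a full two-input rotation. A sketch of that trick under the same names the macro uses (zero-padded input assumed; the helper itself is illustrative):

#include <emmintrin.h>

/* stg1_0 == pair_set_epi16(cospi_31_64, -cospi_1_64); pairing in[1] with zero
   means the -cospi_1_64 slots multiply zeros, leaving in * cospi_31_64. */
static __m128i stage1_term(__m128i in1, __m128i stg1_0) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, zero);
  return _mm_madd_epi16(lo_1_31, stg1_0);  /* four 32-bit products */
}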
-
-#define IDCT32(in0, in1) \
-  /* Stage1 */ \
-  { \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
-    \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
-    \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
-    \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
-                           stp1_30) \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
-                           stp1_21, stp1_26) \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
-                           stp1_23, stp1_24) \
-  } \
-  \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
-    \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
-                           stp2_14) \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
-                           stp2_12) \
-    \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
-    \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
-    \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
-    \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
-  } \
-  \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
-    \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-    \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-    \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
-                           stp1_6) \
-    \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-    \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
-                           stp1_29) \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25) \
-    \
-    stp1_16 = stp2_16; \
-    stp1_31 = stp2_31; \
-    stp1_19 = stp2_19; \
-    stp1_20 = stp2_20; \
-    stp1_23 = stp2_23; \
-    stp1_24 = stp2_24; \
-    stp1_27 = stp2_27; \
-    stp1_28 = stp2_28; \
-  } \
-  \
-  /* Stage4 */ \
-  { \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
-    \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
-    \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-    \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
-                           stp2_13) \
-    \
-    stp2_8 = stp1_8; \
-    stp2_15 = stp1_15; \
-    stp2_11 = stp1_11; \
-    stp2_12 = stp1_12; \
-    \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-    \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-  } \
-  \
-  /* Stage5 */ \
-  { \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-    \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-    \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-    \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-    \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-    \
-    stp1_4 = stp2_4; \
-    stp1_7 = stp2_7; \
-    \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-    \
-    stp1_16 = stp2_16; \
-    stp1_17 = stp2_17; \
-    \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
-                           stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
-                           stp1_26) \
-    \
-    stp1_22 = stp2_22; \
-    stp1_23 = stp2_23; \
-    stp1_24 = stp2_24; \
-    stp1_25 = stp2_25; \
-    stp1_30 = stp2_30; \
-    stp1_31 = stp2_31; \
-  } \
-  \
-  /* Stage6 */ \
-  { \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-    \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-    \
-    stp2_8 = stp1_8; \
-    stp2_9 = stp1_9; \
-    stp2_14 = stp1_14; \
-    stp2_15 = stp1_15; \
-    \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
-                           stp2_12) \
-    \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-    \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-  } \
-  \
-  /* Stage7 */ \
-  { \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-    \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-    \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-    \
-    stp1_16 = stp2_16; \
-    stp1_17 = stp2_17; \
-    stp1_18 = stp2_18; \
-    stp1_19 = stp2_19; \
-    \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
-                           stp1_26) \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
-                           stp1_24) \
-    \
-    stp1_28 = stp2_28; \
-    stp1_29 = stp2_29; \
-    stp1_30 = stp2_30; \
-    stp1_31 = stp2_31; \
-  }
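After IDCT32's seven stages, the 32 outputs are just sums and differences of the stage-7 values, mirrored around the middle; that is the add/sub ladder the caller below applies to fill col[] and in[]. A scalar sketch of that symmetry (hypothetical helper, stp[] standing in for stp1_0..stp1_31):

#include <stdint.h>

static void idct32_output_butterfly(const int16_t stp[32], int16_t out[32]) {
  for (int k = 0; k < 16; ++k) {
    out[k]      = (int16_t)(stp[k] + stp[31 - k]); /* col[0] = stp1_0 + stp1_31 */
    out[31 - k] = (int16_t)(stp[k] - stp[31 - k]); /* col[31] = stp1_0 - stp1_31 */
  }
}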
But remove them will - // lead to performance drop on some devices. - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. 
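/*
 * Editor's note: LOAD_DQCOEFF is defined elsewhere in this file; from the
 * usage below it loads eight dequantized coefficients into one __m128i of
 * 16-bit lanes and advances the input pointer, so in[r], in[r + 8],
 * in[r + 16] and in[r + 24] receive columns 0-7, 8-15, 16-23 and 24-31 of
 * row r of the current 8-row band. A hedged sketch of the likely expansion
 * (the exact macro body is an assumption; load_input_data is the helper
 * declared in inv_txfm_sse2.h further down in this diff):
 */
#define LOAD_DQCOEFF_SKETCH(reg, input) \
  do {                                  \
    (reg) = load_input_data(input);     \
    (input) += 8;                       \
  } while (0)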
- LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] 
= _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32(in, in + 16) - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32(in, in + 16) - - // 2_D: Calculate the results and store them to destination. 
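/*
 * Editor's note: the loop below ends, like the 34-coefficient version, with
 * "final rounding and shift", i.e. ROUND_POWER_OF_TWO(x, 6): the two 1-D
 * passes leave six extra bits of precision, removed by adding 1 << 5 and
 * shifting right by 6 before RECON_AND_STORE adds the residual to the
 * prediction with unsigned saturation. A scalar sketch of that
 * reconstruction step:
 */
#include <stdint.h>

static void recon_and_store_sketch(uint8_t *dest, const int16_t *residual,
                                   int n) {
  for (int i = 0; i < n; ++i) {
    const int v = dest[i] + ((residual[i] + (1 << 5)) >> 6);
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packus clamp */
  }
}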
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); - } -} - -// Apply a 32-element IDCT to 8 columns. This does not do any transposition -// of its input - the caller is expected to have done that. -// The input buffers are the top and bottom halves of an 8x32 block. 
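/*
 * Editor's note: a hypothetical usage sketch for idct32_8col (defined just
 * below); the caller transposes the 8x32 block first, and the two
 * 16-register halves are transformed in place:
 */
#include <emmintrin.h>

void idct32_8col(__m128i *in0, __m128i *in1); /* declared in inv_txfm_sse2.h */

static void idct32_columns_sketch(__m128i block[32]) {
  /* block[0..15]: rows 0-15 of an already-transposed 8x32 block,
     block[16..31]: rows 16-31. */
  idct32_8col(block, block + 16);
}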
-void idct32_8col(__m128i *in0, __m128i *in1) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 
stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - IDCT32(in0, in1) - - // 2_D: Calculate the results and store them to destination. - in0[0] = _mm_add_epi16(stp1_0, stp1_31); - in0[1] = _mm_add_epi16(stp1_1, stp1_30); - in0[2] = _mm_add_epi16(stp1_2, stp1_29); - in0[3] = _mm_add_epi16(stp1_3, stp1_28); - in0[4] = _mm_add_epi16(stp1_4, stp1_27); - in0[5] = _mm_add_epi16(stp1_5, stp1_26); - in0[6] = _mm_add_epi16(stp1_6, stp1_25); - in0[7] = _mm_add_epi16(stp1_7, stp1_24); - in0[8] = _mm_add_epi16(stp1_8, stp1_23); - in0[9] = _mm_add_epi16(stp1_9, stp1_22); - in0[10] = _mm_add_epi16(stp1_10, stp1_21); - in0[11] = _mm_add_epi16(stp1_11, stp1_20); - in0[12] = _mm_add_epi16(stp1_12, stp1_19); - in0[13] = _mm_add_epi16(stp1_13, stp1_18); - in0[14] = _mm_add_epi16(stp1_14, stp1_17); - in0[15] = _mm_add_epi16(stp1_15, stp1_16); - in1[0] = _mm_sub_epi16(stp1_15, stp1_16); - in1[1] = _mm_sub_epi16(stp1_14, stp1_17); - in1[2] = _mm_sub_epi16(stp1_13, stp1_18); - in1[3] = _mm_sub_epi16(stp1_12, stp1_19); - in1[4] = _mm_sub_epi16(stp1_11, stp1_20); - in1[5] = _mm_sub_epi16(stp1_10, stp1_21); - in1[6] = _mm_sub_epi16(stp1_9, stp1_22); - in1[7] = _mm_sub_epi16(stp1_8, stp1_23); - in1[8] = _mm_sub_epi16(stp1_7, stp1_24); - in1[9] = _mm_sub_epi16(stp1_6, stp1_25); - in1[10] = _mm_sub_epi16(stp1_5, stp1_26); - in1[11] = _mm_sub_epi16(stp1_4, stp1_27); - in1[12] = _mm_sub_epi16(stp1_3, stp1_28); - in1[13] = _mm_sub_epi16(stp1_2, stp1_29); - in1[14] = _mm_sub_epi16(stp1_1, stp1_30); - in1[15] = _mm_sub_epi16(stp1_0, stp1_31); -} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h deleted file mode 100644 index 342816977..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ -#define AOM_DSP_X86_INV_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// perform 8x8 transpose -static INLINE void array_transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, 
tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -// Function to allow 8 bit optimisations to be used when profile 0 is used with -// highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { - if (sizeof(tran_low_t) == 4) { - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); - } else { - return _mm_load_si128((const __m128i *)data); - } -} - -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - 
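/* Editor's note: _mm_adds_epi16 (a saturating add) is used for the rounding
   additions above and below so that residuals near INT16_MAX cannot wrap
   around before the arithmetic shift by 6. */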
in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -void iadst16_8col(__m128i *in); -void idct16_8col(__m128i *in); -void aom_idct4_sse2(__m128i *in); -void aom_idct8_sse2(__m128i *in); -void aom_idct16_sse2(__m128i *in0, __m128i *in1); -void aom_iadst4_sse2(__m128i *in); -void aom_iadst8_sse2(__m128i *in); -void aom_iadst16_sse2(__m128i *in0, __m128i *in1); -void idct32_8col(__m128i *in0, __m128i *in1); - -#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c deleted file mode 100644 index 9d006797b..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c +++ /dev/null @@ -1,1333 +0,0 @@ -/* - * Copyright (c) 2017 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <tmmintrin.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/x86/inv_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - { - /* Stage1 */ - { - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); - - { - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp1 = _mm_madd_epi16(hi_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp3 = _mm_madd_epi16(hi_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp5 = _mm_madd_epi16(hi_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - tmp7 = _mm_madd_epi16(hi_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - tmp4 = _mm_srai_epi32(tmp4, 14); - tmp5 = _mm_srai_epi32(tmp5, 14); - tmp6 = _mm_srai_epi32(tmp6, 14); - tmp7 = _mm_srai_epi32(tmp7, 14); - - stp1_4 = _mm_packs_epi32(tmp0, tmp1); - stp1_7 = _mm_packs_epi32(tmp2, tmp3); - stp1_5 = _mm_packs_epi32(tmp4, tmp5); - stp1_6 = _mm_packs_epi32(tmp6, tmp7); - } - } - - /* Stage2 */ - { - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); - - { - tmp0 = _mm_unpacklo_epi16(in0, in4); - tmp1 = _mm_unpackhi_epi16(in0, in4); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp4 = _mm_madd_epi16(tmp0, stk2_1); - tmp5 = _mm_madd_epi16(tmp1, stk2_1); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = 
_mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp2, tmp3); - stp2_1 = _mm_packs_epi32(tmp4, tmp5); - - tmp0 = _mm_madd_epi16(lo_26, stg2_2); - tmp1 = _mm_madd_epi16(hi_26, stg2_2); - tmp2 = _mm_madd_epi16(lo_26, stg2_3); - tmp3 = _mm_madd_epi16(hi_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - - stp2_2 = _mm_packs_epi32(tmp0, tmp1); - stp2_3 = _mm_packs_epi32(tmp2, tmp3); - } - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - } - - /* Stage3 */ - { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp4 = _mm_madd_epi16(tmp0, stk2_0); - tmp5 = _mm_madd_epi16(tmp1, stk2_0); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - stp1_6 = _mm_packs_epi32(tmp4, tmp5); - } - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - } - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i 
stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - - // Stage1 - tmp0 = _mm_mulhrs_epi16(in0, stg1_0); - tmp1 = _mm_mulhrs_epi16(in0, stg1_1); - tmp2 = _mm_mulhrs_epi16(in1, stg1_2); - tmp3 = _mm_mulhrs_epi16(in1, stg1_3); - - stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); - stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); - - // Stage2 - tmp0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); - - tmp1 = _mm_mulhrs_epi16(in1, stg2_2); - tmp2 = _mm_mulhrs_epi16(in1, stg2_3); - stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - - tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp1 = _mm_madd_epi16(tmp0, stg3_0); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp1, tmp2); - - // Stage3 - tmp2 = _mm_add_epi16(stp2_0, stp2_2); - tmp3 = _mm_sub_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); - stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); - - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - /* Stage1 */ - stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); - stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); - stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); - stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); - - /* Stage2 */ - stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); - - stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); - stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - /* Stage3 */ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, 
stk2_0); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -// Only do addition and subtraction butterfly, size = 16, 32 -static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); - i++; - } -} - -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm_madd_epi16(x0, co0); \ - tmp1 = _mm_madd_epi16(x1, co0); \ - tmp2 = _mm_madd_epi16(x0, co1); \ - tmp3 = _mm_madd_epi16(x1, co1); \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m128i *x0, const __m128i *x1, - const __m128i *c0, const __m128i *c1, __m128i *y0, - __m128i *y1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm_packs_epi32(tmp0, tmp1); - *y1 = _mm_packs_epi32(tmp2, tmp3); -} - -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, - const __m128i *c1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = 
_mm_packs_epi32(tmp0, tmp1); - *x1 = _mm_packs_epi32(tmp2, tmp3); -} - -static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x4, x5, x6, x7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm_add_epi16(v0, v15); - stp1[15] = _mm_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm_add_epi16(u0, u1); - v9 = _mm_add_epi16(u4, u6); - v10 = _mm_sub_epi16(u4, u6); - v11 = _mm_sub_epi16(u0, u1); - v12 = _mm_sub_epi16(u2, u3); - v13 = _mm_sub_epi16(u5, u7); - v14 = _mm_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm_add_epi16(x1, x6); // stp2_1 - v2 = _mm_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm_add_epi16(v1, v14); - stp1[14] = _mm_sub_epi16(v1, v14); - - stp1[2] = _mm_add_epi16(v2, v13); - stp1[13] = _mm_sub_epi16(v2, v13); - - v3 = _mm_add_epi16(x1, x4); // stp2_3 - v4 = _mm_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm_sub_epi16(x1, x6); // stp2_6 - v7 = _mm_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm_add_epi16(v3, v12); - stp1[12] = _mm_sub_epi16(v3, v12); - - stp1[6] = _mm_add_epi16(v6, v9); - stp1[9] = _mm_sub_epi16(v6, v9); - - stp1[7] = _mm_add_epi16(v7, v8); - stp1[8] = _mm_sub_epi16(v7, v8); - - stp1[4] = _mm_add_epi16(v4, v11); - stp1[11] = _mm_sub_epi16(v4, v11); - - stp1[5] = _mm_add_epi16(v5, v10); - stp1[10] = _mm_sub_epi16(v5, v10); -} - -static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = 
pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm_mulhrs_epi16(in[1], stk1_0); - v31 = _mm_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm_mulhrs_epi16(in[7], stk1_6); - v28 = _mm_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm_mulhrs_epi16(in[5], stk1_8); - v27 = _mm_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm_mulhrs_epi16(in[3], stk1_14); - v24 = _mm_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - u24 = _mm_add_epi16(v24, v27); - u27 = _mm_sub_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u28 = _mm_sub_epi16(v31, v28); - u31 = _mm_add_epi16(v28, v31); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[16] = _mm_add_epi16(u16, u23); - stp1[23] = _mm_sub_epi16(u16, u23); - - stp1[17] = _mm_add_epi16(u17, u22); - stp1[22] = _mm_sub_epi16(u17, u22); - - stp1[18] = _mm_add_epi16(u18, u21); - stp1[21] = _mm_sub_epi16(u18, u21); - - stp1[19] = _mm_add_epi16(u19, u20); - stp1[20] = _mm_sub_epi16(u19, u20); - - stp1[24] = _mm_sub_epi16(u31, u24); - stp1[31] = _mm_add_epi16(u24, u31); - - stp1[25] = _mm_sub_epi16(u30, u25); - stp1[30] = _mm_add_epi16(u25, u30); - - stp1[26] = _mm_sub_epi16(u29, u26); - stp1[29] = _mm_add_epi16(u26, u29); - - stp1[27] = _mm_sub_epi16(u28, u27); - stp1[28] = _mm_add_epi16(u27, u28); - - butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); - butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); - butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); - butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); -} - -// Only upper-left 8x8 has non-zero coeff -void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t 
*dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - __m128i in[32], col[32]; - __m128i stp1[32]; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - array_transpose_8x8(in, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 1_D: Store 32 intermediate results for each 8x32 block. - add_sub_butterfly(stp1, col, 32); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 2_D: Calculate the results and store them to destination. - add_sub_butterfly(stp1, in, 32); - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -// in0[16] represents the left 8x16 block -// in1[16] represents the right 8x16 block -static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, - __m128i *in1) { - int i; - for (i = 0; i < 16; i++) { - in0[i] = load_input_data(input); - in1[i] = load_input_data(input + 8); - input += 32; - } -} - -static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, - __m128i *out1) { - array_transpose_8x8(in0, out0); - array_transpose_8x8(&in0[8], out1); - array_transpose_8x8(in1, &out0[8]); - array_transpose_8x8(&in1[8], &out1[8]); -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm_mulhrs_epi16(in[0], stk4_0); - u2 = _mm_mulhrs_epi16(in[8], stk4_2); - u3 = _mm_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - { - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm_mulhrs_epi16(in[4], stk3_0); - u7 = _mm_mulhrs_epi16(in[4], stk3_1); - u5 = _mm_mulhrs_epi16(in[12], stk3_2); - u6 = _mm_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - 
out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - -static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm_mulhrs_epi16(in[2], stk2_0); - u15 = _mm_mulhrs_epi16(in[2], stk2_1); - u9 = _mm_mulhrs_epi16(in[14], stk2_2); - u14 = _mm_mulhrs_epi16(in[14], stk2_3); - u10 = _mm_mulhrs_epi16(in[10], stk2_4); - u13 = _mm_mulhrs_epi16(in[10], stk2_5); - u11 = _mm_mulhrs_epi16(in[6], stk2_6); - u12 = _mm_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to out[32] -static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_8x32_135_quarter_1(in, temp); - idct32_8x32_135_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m128i stk1_5 = pair_set_epi16(2 * 
cospi_9_64, 2 * cospi_9_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = _mm_mulhrs_epi16(in[1], stk1_0); - u31 = _mm_mulhrs_epi16(in[1], stk1_1); - u17 = _mm_mulhrs_epi16(in[15], stk1_2); - u30 = _mm_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm_mulhrs_epi16(in[9], stk1_4); - u29 = _mm_mulhrs_epi16(in[9], stk1_5); - u19 = _mm_mulhrs_epi16(in[7], stk1_6); - u28 = _mm_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm_mulhrs_epi16(in[5], stk1_8); - u27 = _mm_mulhrs_epi16(in[5], stk1_9); - u21 = _mm_mulhrs_epi16(in[11], stk1_10); - u26 = _mm_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm_mulhrs_epi16(in[13], stk1_12); - u25 = _mm_mulhrs_epi16(in[13], stk1_13); - u23 = _mm_mulhrs_epi16(in[3], stk1_14); - u24 = _mm_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - 
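/* butterfly() and butterfly_self() implement the planar rotation used by
 * every idct stage: with c0 = pair_set_epi16(a, b) and c1 = pair_set_epi16(c,
 * d), each 32-bit _mm_madd_epi16 lane forms x * a + y * b before the 14-bit
 * DCT rounding shift. A scalar reference for that contract (a sketch; the
 * SIMD helpers are defined earlier in this file):
 */
static INLINE void butterfly_sketch(int16_t x, int16_t y, int16_t a, int16_t b,
                                    int16_t c, int16_t d, int16_t *out0,
                                    int16_t *out1) {
  *out0 = (int16_t)(((int32_t)x * a + (int32_t)y * b + (1 << 13)) >> 14);
  *out1 = (int16_t)(((int32_t)x * c + (int32_t)y * d + (1 << 13)) >> 14);
}
/* butterfly_self() is the same rotation with the results written back over
 * x and y, e.g. stg4_4/stg4_5 above rotate the pair (u18, u29) through the
 * cospi_8_64/cospi_24_64 coefficients. */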
out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - v20 = _mm_sub_epi16(u19, u20); - v21 = _mm_sub_epi16(u18, u21); - v22 = _mm_sub_epi16(u17, u22); - v23 = _mm_sub_epi16(u16, u23); - - v24 = _mm_sub_epi16(u31, u24); - v25 = _mm_sub_epi16(u30, u25); - v26 = _mm_sub_epi16(u29, u26); - v27 = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 8x16 block, input __m128i in[16], output __m128i in[32] -static void idct32_8x32_135(__m128i *in /*in[32]*/) { - __m128i out[32]; - idct32_8x32_quarter_1_2(in, out); - idct32_8x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - int j = 0; - while (j < 32) { - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); - - in[j] = _mm_srai_epi16(in[j], 6); - in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - - RECON_AND_STORE(dst, in[j]); - dst += stride; - RECON_AND_STORE(dst, in[j + 1]); - dst += stride; - j += 2; - } -} - -static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, - int stride) { - store_buffer_8x32(in0, dest, stride); - store_buffer_8x32(in1, dest + 8, stride); -} - -static INLINE void idct32_135(__m128i *col0, __m128i *col1) { - idct32_8x32_135(col0); - idct32_8x32_135(col1); -} - -typedef enum { left_16, right_16 } ColsIndicator; - -static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, - ColsIndicator cols) { - switch (cols) { - case left_16: { - int i; - array_transpose_16x16(in0, in1); - for (i = 0; i < 16; ++i) { - store[i] = in0[16 + i]; - store[16 + i] = in1[16 + i]; - } - break; - } - case right_16: { - array_transpose_16x16_2(store, &store[16], in0, in1); - break; - } - default: { assert(0); } - } -} - -// Only upper-left 16x16 has non-zero coeff -void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - // Each array represents an 8x32 block - __m128i col0[32], col1[32]; - // This array represents a 16x16 block - __m128i temp[32]; - - // Load input data. Only need to load the top left 16x16 block. 
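/* Coefficient layout for this shortcut (the "135" in the name appears to
 * refer to the eob bound the caller guarantees): the scan order confines
 * every nonzero coefficient to the top-left 16x16 corner of the 32x32 grid,
 * and with 8 int16 lanes per __m128i that corner is exactly col0[16]
 * (columns 0-7) plus col1[16] (columns 8-15):
 *
 *   row i, cols 0..7   ->  input + i * 32       ->  col0[i]
 *   row i, cols 8..15  ->  input + i * 32 + 8   ->  col1[i]
 *
 * Rows and columns 16-31 are known to be zero and are never read. */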
- load_buffer_16x16(input, col0, col1); - - // columns - array_transpose_16x16(col0, col1); - idct32_135(col0, col1); - - // rows - transpose_and_copy_16x16(col0, col1, temp, left_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest, stride); - - transpose_and_copy_16x16(col0, col1, temp, right_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest + 16, stride); -} - -// For each 8x32 block __m128i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m128i in[32] -static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m128i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m128i in[32] -static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = 
pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - -// For each 8x32 block __m128i in[32], -// Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m128i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = 
pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - out[4] = _mm_sub_epi16(u19, u20); - out[5] = _mm_sub_epi16(u18, u21); - out[6] = _mm_sub_epi16(u17, u22); - out[7] = _mm_sub_epi16(u16, u23); - - out[8] = _mm_sub_epi16(u31, u24); - out[9] = _mm_sub_epi16(u30, u25); - out[10] = _mm_sub_epi16(u29, u26); - out[11] = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_full_8x32_quarter_1(in, temp); - idct32_full_8x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -static void idct32_full_8x32(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[32]; - idct32_full_8x32_quarter_1_2(in, temp); - idct32_full_8x32_quarter_3_4(in, &temp[16]); - add_sub_butterfly(temp, out, 32); -} - -static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { - int i; - for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = load_input_data(input + 24); - input += 32; - } -} - -void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i col[128], in[32]; - int i, j; - - // rows - for (i = 0; i < 4; ++i) { - load_buffer_8x32(input, in); - input += 32 << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - 
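/* Pass structure of aom_idct32x32_1024_add_ssse3: this "rows" loop runs the
 * 1-D 32-point idct on four 8-row strips and parks the results in col[128];
 * the "columns" loop that follows re-transposes 8 columns at a time, repeats
 * the 1-D transform, and reconstructs into dest. add_sub_butterfly(in, out,
 * n) supplies the final mirror stage of that 1-D transform; a scalar sketch
 * of the contract its call sites imply (the SIMD version, defined earlier in
 * the file, uses saturating adds):
 */
static void add_sub_butterfly_sketch(const int16_t *in, int16_t *out, int n) {
  for (int i = 0; i < n / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[n - 1 - i]);
    out[n - 1 - i] = (int16_t)(in[i] - in[n - 1 - i]);
  }
}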
array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - idct32_full_8x32(in, col + (i << 5)); - } - - // columns - for (i = 0; i < 4; ++i) { - j = i << 3; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - idct32_full_8x32(in, in); - store_buffer_8x32(in, dest, stride); - dest += 8; - } -} diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm index f0668e6f3..0bc841a7a 100644 --- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm +++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm @@ -85,15 +85,10 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_HIGHBITDEPTH mova m0, [inputq + 0] packssdw m0, [inputq + 16] mova m1, [inputq + 32] packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif psraw m0, 2 psraw m1, 2 diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c new file mode 100644 index 000000000..c3c88245a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95]. 
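/* _mm_sad_epu8 sums |a[i] - b[i]| over each 8-byte half into the low 16 bits
 * of the corresponding 64-bit lane; accumulating those with _mm_add_epi32
 * grows them into the two 32-bit partials described above, so the reduction
 * below just adds lane 0 to lane 2 of the 32-bit view. A scalar model of one
 * 16-byte step (an illustration, not part of the patch):
 */
static unsigned int sad16_scalar_sketch(const uint8_t a[16],
                                        const uint8_t b[16]) {
  unsigned int lo = 0, hi = 0;
  for (int i = 0; i < 8; ++i) lo += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  for (int i = 8; i < 16; ++i) hi += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return lo + hi; /* what _mm_cvtsi128_si32 + _mm_srli_si128(sad, 8) extract */
}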
+ const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +#define jnt_sadMxN_sse2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + 
jcp_param); \ + return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +#define jnt_sadMxN_avx2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ + } + +/* clang-format off */ +jnt_sadMxN_sse2(128, 128) +jnt_sadMxN_sse2(128, 64) +jnt_sadMxN_sse2(64, 128) +jnt_sadMxN_sse2(64, 64) +jnt_sadMxN_sse2(64, 32) +jnt_sadMxN_sse2(32, 64) +jnt_sadMxN_sse2(32, 32) +jnt_sadMxN_sse2(32, 16) +jnt_sadMxN_sse2(16, 32) +jnt_sadMxN_sse2(16, 16) +jnt_sadMxN_sse2(16, 8) +jnt_sadMxN_sse2(8, 16) +jnt_sadMxN_sse2(8, 8) +jnt_sadMxN_sse2(8, 4) +jnt_sadMxN_sse2(4, 8) +jnt_sadMxN_sse2(4, 4) +jnt_sadMxN_sse2(4, 16) +jnt_sadMxN_sse2(16, 4) +jnt_sadMxN_sse2(8, 32) +jnt_sadMxN_sse2(32, 8) +jnt_sadMxN_sse2(16, 64) +jnt_sadMxN_sse2(64, 16) + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 000000000..9801e285c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. 
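/* Worked check of the note above: _mm_maddubs_epi16 multiplies unsigned
 * bytes from its first operand by *signed* bytes from its second, so a tap
 * of 128 does not fit the signed range [-128, 127] and would be read as
 * -128. Halving both taps keeps them representable, and dropping one bit
 * from the rounding constant and shift preserves the result exactly for
 * these even-valued bilinear filters:
 *
 *   (a0 * f0 + a1 * f1 + (1 << (FILTER_BITS - 1))) >> FILTER_BITS
 *     == (a0 * (f0 / 2) + a1 * (f1 / 2) + (1 << (FILTER_BITS - 2)))
 *            >> (FILTER_BITS - 1)
 *
 * The halved taps sum to 64, so each 16-bit product sum is at most
 * 255 * 64 = 16320, well inside int16 range. */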
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = _mm_setzero_si128();
+
+        // avoid loading undefined memory
+        if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
+        __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
+
+        // shuffle to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    for (i = 0; i < output_height; ++i) {
+      // load source; only the first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle; only the first 8 values are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to extract the low byte of each 32-bit result
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
+
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                 int width, int height, const uint8_t *ref,
+                                 int ref_stride,
+                                 const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels, two rows at a time
+    assert(!(width & 7));
+    assert(!(width & 1));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels, four rows at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (i = 0; i < height; i += 4) {
+      const uint8_t *row0 = ref + 0 * ref_stride;
+      const uint8_t *row1 = ref + 1 * ref_stride;
+      const uint8_t *row2 = ref + 2 * ref_stride;
+      const uint8_t *row3 = ref + 3 * ref_stride;
+
+      __m128i p0 =
+          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+                        row3[0], row3[1], row3[2], row3[3]);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+  /* The total number of pixels must be a multiple of 16 (e.g., 4x4). */
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 =
xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} + +#define JNT_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +JNT_SUBPIX_AVG_VAR(128, 128) +JNT_SUBPIX_AVG_VAR(128, 64) +JNT_SUBPIX_AVG_VAR(64, 128) +JNT_SUBPIX_AVG_VAR(64, 64) +JNT_SUBPIX_AVG_VAR(64, 32) +JNT_SUBPIX_AVG_VAR(32, 64) +JNT_SUBPIX_AVG_VAR(32, 32) +JNT_SUBPIX_AVG_VAR(32, 16) +JNT_SUBPIX_AVG_VAR(16, 32) +JNT_SUBPIX_AVG_VAR(16, 16) +JNT_SUBPIX_AVG_VAR(16, 8) +JNT_SUBPIX_AVG_VAR(8, 16) +JNT_SUBPIX_AVG_VAR(8, 8) +JNT_SUBPIX_AVG_VAR(8, 4) +JNT_SUBPIX_AVG_VAR(4, 8) +JNT_SUBPIX_AVG_VAR(4, 4) +JNT_SUBPIX_AVG_VAR(4, 16) +JNT_SUBPIX_AVG_VAR(16, 4) +JNT_SUBPIX_AVG_VAR(8, 32) +JNT_SUBPIX_AVG_VAR(32, 8) +JNT_SUBPIX_AVG_VAR(16, 64) +JNT_SUBPIX_AVG_VAR(64, 16) diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c index bf8150e2a..18862dd3e 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c @@ -11,13 +11,14 @@ #include <immintrin.h> /* AVX2 */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" -void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void aom_lpf_horizontal_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -368,7 +369,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, +void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index 8343dbbed..f1eac233b 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -11,7 +11,9 @@ #include <emmintrin.h> // SSE2 -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" @@ -19,1047 +21,1016 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -#if CONFIG_PARALLEL_DEBLOCKING -// filter_mask and hev_mask -#define FILTER_HEV_MASK4 \ - do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = 
abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = \ - _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask2(*limit, *blimit, */ \ - /* p1, p0, q0, q1); */ \ - abs_p0q0 = \ - _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ - abs_p1q1 = \ - _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ - } while (0) -#endif // CONFIG_PARALLEL_DEBLOCKING - -// filter_mask and hev_mask -#define FILTER_HEV_MASK \ - do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = \ - _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = \ - _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ - abs_p1q1 = \ - _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ - } while (0) - -#define FILTER4 \ - do { \ - const __m128i t3t4 = \ - _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = _mm_unpackhi_epi8(filter2filter1, 
filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ - } while (0) +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 
xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 
65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = 
_mm_unpackhi_epi64(w7, w15); +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + const __m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + 
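/* Why the unpack + shift-by-9 above halves a byte: _mm_unpackhi_epi8(x, x)
 * duplicates each byte v into both halves of a 16-bit lane, producing
 * v * 256 + v = v * 257, and (v * 257) >> 9 == v >> 1 for every v in
 * [0, 255]; abs(p1 - q1) / 2 therefore falls out without a multiply or a
 * widening conversion. */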
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); -#if !CONFIG_PARALLEL_DEBLOCKING - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING - FILTER4; - -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 8); - *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0); - - *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 8); - *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0); -#else - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -#endif + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - 
_mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); - - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); - - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); - - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); - - // Transpose 8x8 - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - p1p0 = _mm_unpacklo_epi16(q1q0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x0 = _mm_unpacklo_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - p1p0 = _mm_unpackhi_epi32(p1p0, x0); -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - q1q0 = _mm_unpackhi_epi16(q1q0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x2 = _mm_unpackhi_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - q1q0 = _mm_unpacklo_epi32(q1q0, x2); - - q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); - q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING - FILTER4; + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 - // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 - // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); - // 10 
30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 - x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); - // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); -#if !CONFIG_PARALLEL_DEBLOCKING - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); -#endif - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); -#if !CONFIG_PARALLEL_DEBLOCKING - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); -#endif + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); } -static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num, - uint8_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x); - const __m128i hi = _mm_srli_si128(*x, 8); - *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi); -#else - _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x); - _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x)); -#endif +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); } -void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); __m128i mask, hev, flat, flat2; - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128( - 
_mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = abs_diff(q1p1, q0p0); + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); } - // lp filter + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + { + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = 
_mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - // Filter1 >> 3 - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); - flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); - work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - 
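Both the removed 16-wide path and the new lpf_internal_14_sse2 gate the stronger filters with the same two masks: flat checks the inner pixels on each side against a fixed threshold of 1, and flat2 extends the same check outward; the wide filter runs only where flat2 & flat & mask are all set. In scalar terms (an illustrative model, not the reference code verbatim):

#include <stdint.h>
#include <stdlib.h>

static int flat_mask(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                     uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
  int m = abs(p1 - p0) <= 1 && abs(q1 - q0) <= 1;
  m &= abs(p2 - p0) <= 1 && abs(q2 - q0) <= 1;
  m &= abs(p3 - p0) <= 1 && abs(q3 - q0) <= 1;
  return m ? -1 : 0; /* flat2 applies the same test to the outer taps */
}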
pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( 
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - store_buffer_horz_8(&q6p6, p, 6, s); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - store_buffer_horz_8(&q5p5, p, 5, s); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - store_buffer_horz_8(&q4p4, p, 4, s); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - store_buffer_horz_8(&q3p3, p, 3, s); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - store_buffer_horz_8(&q2p2, p, 2, s); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - store_buffer_horz_8(&q1p1, p, 1, s); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - store_buffer_horz_8(&q0p0, p, 0, s); + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = _mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = 
_mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + 
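The wide-filter section above widens every pixel to 16 bits (_mm_unpacklo/hi_epi8 with zero) so the tap sums cannot overflow, builds the base sums once, and then produces each output with a rounded shift while sliding the window by a couple of adds and subtracts instead of recomputing the sum. The sliding step, written as one helper (hypothetical; the diff inlines these updates):

#include <emmintrin.h>

/* total + add1 + add2 - sub1 - sub2, on eight 16-bit taps at a time */
static __m128i slide_sum(__m128i total, __m128i add1, __m128i add2,
                         __m128i sub1, __m128i sub2) {
  const __m128i x = _mm_add_epi16(total, _mm_add_epi16(add1, add2));
  return _mm_sub_epi16(x, _mm_add_epi16(sub1, sub2));
}

Each flat2_qNpN output is then ROUND_POWER_OF_TWO(sum, 4): the +8 rounding bias is folded into pixelFilter_p up front, _mm_srli_epi16 shifts by 4, and _mm_packus_epi16 narrows back to bytes with unsigned saturation.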
pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); } -} + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -static INLINE __m128i filter_add2_sub2(const __m128i *const total, - const __m128i *const a1, - const __m128i *const a2, - const __m128i *const s1, - const __m128i *const s2) { - __m128i x = _mm_add_epi16(*a1, *total); - x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); - return x; -} + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); -static INLINE __m128i filter8_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f8_lo, - const __m128i *const f8_hi) { - const __m128i f8 = - _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); - const __m128i result = _mm_and_si128(*flat, f8); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); } -static INLINE __m128i filter16_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f_lo, - const __m128i *const f_hi) { - const __m128i f = - _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); - const __m128i result = _mm_and_si128(*flat, f); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + 
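All of the flat/flat2 merges above use the classic SSE2 byte select: with mask bytes of 0x00 or 0xFF, keep the lightly filtered value where the mask is clear and the flat-filtered value where it is set. As a standalone helper (hypothetical name):

#include <emmintrin.h>

static __m128i mask_select(__m128i mask, __m128i if_clear, __m128i if_set) {
  return _mm_or_si128(_mm_andnot_si128(mask, if_clear),
                      _mm_and_si128(mask, if_set));
}

SSE4.1's _mm_blendv_epi8 does this in one instruction, but these kernels stay within the SSE2 baseline.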
__m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + _mm_cvtsi32_si128(*(int *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + _mm_cvtsi32_si128(*(int *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + _mm_cvtsi32_si128(*(int *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + _mm_cvtsi32_si128(*(int *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + _mm_cvtsi32_si128(*(int *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + _mm_cvtsi32_si128(*(int *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + _mm_cvtsi32_si128(*(int *)(s + 6 * p))); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); } -typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput; +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; -static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x, - int p, int offset, uint8_t *s) { - int i; - if (pixel_num == FOUR_PIXELS) { - for (i = 13; i >= 0; i--) { - *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]); - } - } - if (pixel_num == EIGHT_PIXELS) { - for (i = 13; i >= 0; i--) { - _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]); - } - } - if (pixel_num == SIXTEEN_PIXELS) { - for (i = 13; i >= 0; i--) { - _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]); - } - } -} + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); -static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num, - unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - - __m128i op2, op1, op0, oq0, oq1, oq2; - - __m128i max_abs_p1p0q1q0; - - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - 
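A call-site sketch for the rewritten aom_lpf_horizontal_14_sse2 (hypothetical values; s points at the first q-side row and p is the stride). The thresholds are fetched with aligned _mm_load_si128 loads, so the arrays must be 16-byte aligned, which is what aom's DECLARE_ALIGNED provides:

#include <string.h>
#include <stdint.h>
#include "aom_ports/mem.h" /* DECLARE_ALIGNED */

static void run_lpf14(uint8_t *s, int p) {
  DECLARE_ALIGNED(16, uint8_t, blimit[16]);
  DECLARE_ALIGNED(16, uint8_t, limit[16]);
  DECLARE_ALIGNED(16, uint8_t, thresh[16]);
  memset(blimit, 60, 16); /* illustrative threshold values */
  memset(limit, 10, 16);
  memset(thresh, 4, 16);
  aom_lpf_horizontal_14_sse2(s, p, blimit, limit, thresh);
}

The _dual variants further down take two parameter sets and interleave them 32 bits at a time (_mm_unpacklo_epi32), so each 4-pixel edge of the pair is tested against its own blimit/limit/thresh.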
q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); { - const __m128i abs_p1p0 = abs_diff(p1, p0); - const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i abs_p0q0 = abs_diff(p0, q0); - __m128i abs_p1q1 = abs_diff(p1, q1); - __m128i work; - max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // SSE2 has no unsigned byte comparison, so the idea is to detect the + // "X > limit" cases with saturating arithmetic: take the global maximum + // of all the abs(x - y) inputs and of + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2, subtract the limit with unsigned + // saturation and compare against zero; a non-zero result sets the mask. + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + + work = abs_diff(q2p2, q1p1); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - } + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); - { - __m128i work; - work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); - flat = _mm_max_epu8(work, max_abs_p1p0q1q0); - work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7,
q0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter4 + // 5 tap filter { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ff = _mm_cmpeq_epi8(t4, t4); - - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - op1 = _mm_xor_si128(p1, t80); - op0 = _mm_xor_si128(p0, t80); - oq0 = _mm_xor_si128(q0, t80); - oq1 = _mm_xor_si128(q1, t80); - - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); - - work_a = _mm_subs_epi8(oq0, op0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); - oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); - // loopfilter done - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter8 - { - const __m128i four = _mm_set1_epi16(4); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - __m128i f8_lo, f8_hi; - - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), - _mm_add_epi16(p3_lo, p2_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); - 
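The "5 tap filter" block opened above (the flat path of lpf_internal_6_sse2) computes, for pixels where the flat mask is set, the following outputs; the tap weights match the running comments on the workp_a/workp_b updates later in the diff. Scalar sketch:

#include <stdint.h>

static void filter6_scalar(uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                           uint8_t q1, uint8_t q2, uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1) {
  *op1 = (uint8_t)((p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3);
  *op0 = (uint8_t)((p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + 4) >> 3);
  *oq0 = (uint8_t)((p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4) >> 3);
  *oq1 = (uint8_t)((p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4) >> 3);
}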
- f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), - _mm_add_epi16(p3_hi, p2_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); - - op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); - op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); - op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); - oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); - oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); - oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); - const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); - const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); - const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); - const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); - const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); - const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); - - const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); - const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); - const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); - const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); - const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); - const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); - const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); - - __m128i f_lo; - __m128i f_hi; - - f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = - _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); - f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); - - f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = - 
_mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); - f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - - __m128i x[14]; - x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - - store_buffer_horz_16(pixel_num, x, p, 6, s); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 
+ 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } -void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = 
_mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_a, op2, oq2; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); { // filter_mask and hev_mask + + // SSE2 has no unsigned byte comparison, so the idea is to detect the
+ // "X > limit" cases with saturating arithmetic: take the global maximum + // of all the abs(x - y) inputs and of + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2, subtract the limit with unsigned + // saturation and compare against zero; a non-zero result sets the mask. + const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -1067,424 +1038,215 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); } + // filter8 { const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft =
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - } - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 11); - filter1 = _mm_packs_epi16(filter1, filter1); - - // Filter2 >> 3 - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 11); - filter2 = _mm_packs_epi16(filter2, zero); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - filt = _mm_unpacklo_epi8(zero, filt); - filt = _mm_srai_epi16(filt, 9); - filt = _mm_packs_epi16(filt, zero); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = 
_mm_loadl_epi64((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2); - *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1); - *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0); - *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0); - *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1); - *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2); -#else - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); -#endif + + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + op2 = _mm_packus_epi16(workp_shft0, workp_shft0); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + oq2 = _mm_packus_epi16(workp_shft1, workp_shft1); } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, *q2); + q2_16 = _mm_and_si128(flat, oq2); + *q2_out = _mm_or_si128(work_a, q2_16); + + work_a = _mm_andnot_si128(flat, *p2); + p2_16 = _mm_and_si128(flat, op2); + *p2_out = _mm_or_si128(work_a, p2_16); } -void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { -#if CONFIG_PARALLEL_DEBLOCKING - lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); -#else - 
lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); -#endif +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0, p2_out, q2_out; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &p2_out, &q2_out, &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); + xx_storel_32(s - 3 * p, p2_out); + xx_storel_32(s + 2 * p, q2_out); } -void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), _mm_load_si128((const __m128i *)_thresh1)); - __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff 
= _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); - __m128i work; + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); - // filter_mask and hev_mask - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} - // flat_mask4 - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - - do { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), 
zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, 
work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0, p2_out, q2_out; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &p2_out, &q2_out, &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out); + _mm_storel_epi64((__m128i *)(s + 2 * p), 
q2_out); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, @@ -1494,449 +1256,405 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3, p2, q2, q3; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i p1, p0, q0, q1; - __m128i mask, hev, flat; -#if !CONFIG_PARALLEL_DEBLOCKING - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !CONFIG_PARALLEL_DEBLOCKING - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // filter_mask and hev_mask - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i work; -#endif // !CONFIG_PARALLEL_DEBLOCKING - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); -#if !CONFIG_PARALLEL_DEBLOCKING - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); -#endif // !CONFIG_PARALLEL_DEBLOCKING - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = 
_mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - } -} + __m128i l = _mm_unpacklo_epi64(blimit, limit); -static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i x8, x9, x10, x11, x12, x13, x14, x15; - - // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 - x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 - - x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 - - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 - - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 - - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 - x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 - - x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 - - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 - x10 = 
_mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 - - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 - - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 - x14 = _mm_unpacklo_epi32(x12, x13); // 15 - x15 = _mm_unpackhi_epi32(x12, x13); // 16 + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - // Store first 4-line result - _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - x4 = _mm_unpackhi_epi16(x0, x1); - x5 = _mm_unpackhi_epi16(x2, x3); - x12 = _mm_unpackhi_epi16(x8, x9); - x13 = _mm_unpackhi_epi16(x10, x11); + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - x6 = _mm_unpacklo_epi32(x4, x5); - x7 = _mm_unpackhi_epi32(x4, x5); - x14 = _mm_unpacklo_epi32(x12, x13); - x15 = _mm_unpackhi_epi32(x12, x13); + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); - // Store second 4-line result - _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); } -#if CONFIG_PARALLEL_DEBLOCKING -#define movq(p) _mm_loadl_epi64((const __m128i *)(p)) -#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1) -#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1) -#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1) -#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r) -#define pshufd(r, imm) _mm_shuffle_epi32(r, imm) -enum { ROTATE_DWORD_RIGHT = 0x39 }; -static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride, - const uint8_t *pSrc, - const ptrdiff_t srcStride) { - for (uint32_t idx = 0; idx < 2; idx += 1) { - __m128i r0, r1, r2, r3; - // load data - r0 = movq(pSrc); - r1 = movq(pSrc + srcStride); - r2 = movq(pSrc + srcStride * 2); - r3 = movq(pSrc + srcStride * 3); - // transpose - r0 = punpcklbw(r0, r1); - r2 = punpcklbw(r2, r3); - r1 = punpckhwd(r0, r2); - r0 = punpcklwd(r0, r2); - // store data - movd(pDst, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 2, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 3, r0); - movd(pDst + dstStride * 4, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 5, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 6, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 7, r1); - // advance the pointers - pDst += dstStride * 8; - pSrc += 8; - } -} - -#endif // CONFIG_PARALLEL_DEBLOCKING -static INLINE void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int 
num_8x8_to_transpose) { - int idx8x8 = 0; +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = - _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - x1 = - _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - - x2 = - _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - x3 = - _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - - x4 = - _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - x5 = - _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - - x6 = - _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - x7 = - _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; -void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); -#if !CONFIG_PARALLEL_DEBLOCKING - unsigned char *src[2]; - unsigned char *dst[2]; -#endif // !CONFIG_PARALLEL_DEBLOCKING - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + 
_mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); - // Loop filtering - aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); -#if !CONFIG_PARALLEL_DEBLOCKING - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -#else // CONFIG_PARALLEL_DEBLOCKING - transpose16x4(s - 2, p, t_dst + 16 * 2, 16); -#endif // !CONFIG_PARALLEL_DEBLOCKING -} + __m128i l = _mm_unpacklo_epi64(blimit, limit); -void aom_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); - unsigned char *src[1]; - unsigned char *dst[1]; + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - transpose(src, p, dst, 8, 1); + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - // Loop filtering - aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); - src[0] = t_dst; - dst[0] = s - 4; + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); - // Transpose back - transpose(src, 8, dst, p, 1); -} + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); -void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); - // Loop filtering - aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i 
*)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} - // Transpose back - transpose(src, 16, dst, p, 2); +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); } -void aom_lpf_vertical_16_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); - unsigned char *src[2]; - unsigned char *dst[2]; +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + __m128i p2, p0, q0, q2; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) 
+ 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2, + &q2, &blimit, &limit, &thresh); - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1, + &d2, &d3); - // Loop filtering - aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); - // Transpose back - transpose(src, 8, dst, p, 2); + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p2, p1, q1, q2; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, + &p1p0, &p2, &q2, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3, + &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); } -void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - DECLARE_ALIGNED(16, unsigned char, t_dst[256]); - - // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned 
char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3, x2, x1, x0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i p0_out, p1_out, p2_out, p3_out; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p)); + + transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6, + &p7); + + x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, + &q7); + + q6p6 = _mm_unpacklo_epi64(p1, q6); + q5p5 = _mm_unpacklo_epi64(p2, q5); + q4p4 = _mm_unpacklo_epi64(p3, q4); + q3p3 = _mm_unpacklo_epi64(p4, q3); + q2p2 = _mm_unpacklo_epi64(p5, q2); + q1p1 = _mm_unpacklo_epi64(p6, q1); + q0p0 = _mm_unpacklo_epi64(p7, q0); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &p0_out, &p1_out, &p2_out, &p3_out); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2, + &q3); + + _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out); + _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out); + _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out); + _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out); + + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s + 3 * p), q3); +} - // Loop filtering - aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); - // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = 
_mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+  q7 = _mm_srli_si128(d14d15, 8);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+
+  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
 }
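The 14-tap functions above keep each pX row in the low eight bytes of an XMM register and the matching qX row in the high eight bytes (the qXpX variables), so a single filter pass updates both sides of the edge at once. A minimal sketch of that pairing, built only on SSE2 intrinsics; the pack_and_split() helper is hypothetical and not part of this patch:

#include <emmintrin.h>
#include <stdint.h>

/* Toy model of the qXpX layout used by aom_lpf_*_14_sse2 above: the p row
 * lands in the low 64 bits, the q row in the high 64 bits. */
static void pack_and_split(const uint8_t *p_row, const uint8_t *q_row,
                           uint8_t *p_out, uint8_t *q_out) {
  __m128i qp = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)p_row),
                                  _mm_loadl_epi64((const __m128i *)q_row));
  /* ... a real filter would transform both rows here ... */
  _mm_storel_epi64((__m128i *)p_out, qp);                    /* low half  */
  _mm_storel_epi64((__m128i *)q_out, _mm_srli_si128(qp, 8)); /* high half */
}

Working on both sides in one register roughly halves the shuffle and arithmetic work relative to filtering the p and q rows separately.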
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index 027c890dc..c6b6469b4 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -14,117 +14,202 @@
 #include <emmintrin.h>  // SSE2
-#include "./aom_config.h"
-
-static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
-                                    int out_p, int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    uint16_t *in = src[idx8x8];
-    uint16_t *out = dst[idx8x8];
-
-    p0 =
-        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    p1 =
-        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    p2 =
-        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    p3 =
-        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    p4 =
-        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    p5 =
-        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    p6 =
-        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    p7 =
-        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13
-    x0 = _mm_unpacklo_epi16(p0, p1);
-    // 20 30 21 31 22 32 23 33
-    x1 = _mm_unpacklo_epi16(p2, p3);
-    // 40 50 41 51 42 52 43 53
-    x2 = _mm_unpacklo_epi16(p4, p5);
-    // 60 70 61 71 62 72 63 73
-    x3 = _mm_unpacklo_epi16(p6, p7);
-    // 00 10 20 30 01 11 21 31
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 40 50 60 70 41 51 61 71
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 00 10 20 30 40 50 60 70
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 01 11 21 31 41 51 61 71
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
-    // 00 10 20 30 40 50 60 70
-    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
-    // 01 11 21 31 41 51 61 71
-
-    // 02 12 22 32 03 13 23 33
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 42 52 62 72 43 53 63 73
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 02 12 22 32 42 52 62 72
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
-    // 02 12 22 32 42 52 62 72
-    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
-    // 03 13 23 33 43 53 63 73
-
-    // 04 14 05 15 06 16 07 17
-    x0 = _mm_unpackhi_epi16(p0, p1);
-    // 24 34 25 35 26 36 27 37
-    x1 = _mm_unpackhi_epi16(p2, p3);
-    // 44 54 45 55 46 56 47 57
-    x2 = _mm_unpackhi_epi16(p4, p5);
-    // 64 74 65 75 66 76 67 77
-    x3 = _mm_unpackhi_epi16(p6, p7);
-    // 04 14 24 34 05 15 25 35
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 44 54 64 74 45 55 65 75
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 04 14 24 34 44 54 64 74
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 05 15 25 35 45 55 65 75
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
-    // 04 14 24 34 44 54 64 74
-    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
-    // 05 15 25 35 45 55 65 75
-
-    // 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 06 16 26 36 46 56 66 76
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
-    // 06 16 26 36 46 56 66 76
-    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
-    // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
+#include "config/aom_config.h"
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *d0, __m128i *d1,
+                                            __m128i *d2, __m128i *d3,
+                                            __m128i *d4, __m128i *d5) {
+  __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+  // 00 01 02 03 04 05 xx xx
+  // 10 11 12 13 14 15 xx xx
+  // 20 21 22 23 24 25 xx xx
+  // 30 31 32 33 34 35 xx xx
+  // 40 41 42 43 44 45 xx xx
+  // 50 51 52 53 54 55 xx xx
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
+  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
+  *d1 = _mm_unpackhi_epi64(ww0,
+                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  *d2 = _mm_unpacklo_epi64(ww0,
+                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
+
+  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
+  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
+  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
+
+  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
+
+  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
+  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
+  *d5 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
+}
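The lane comments in highbd_transpose6x6_sse2 trace an ordinary 6x6 transpose of 16-bit values, with lanes 6 and 7 of every output row left as don't-care ("xx"). As a cross-check, a scalar model of the same mapping; this helper is for illustration only and does not appear in the header:

#include <stdint.h>

/* d[r][c] = x[c][r] for the top-left 6x6 block; the last two lanes of each
 * output row are unspecified in the SIMD version, so they are ignored here. */
static void transpose6x6_model(const uint16_t x[6][8], uint16_t d[6][8]) {
  for (int r = 0; r < 6; ++r)
    for (int c = 0; c < 6; ++c) d[r][c] = x[c][r];
}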
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                                    __m128i *x2, __m128i *x3,
+                                                    __m128i *d0, __m128i *d1,
+                                                    __m128i *d2, __m128i *d3) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+
+  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
+  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
+  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
+  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+                                                     __m128i *x2, __m128i *x3,
+                                                     __m128i *d4, __m128i *d5,
+                                                     __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, ww2, ww3;
+  __m128i zero = _mm_setzero_si128();
+
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+
+  ww2 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
+  ww3 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
+
+  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
+  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
+  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
+  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
 }
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
-                                        uint16_t *out, int out_p) {
-  uint16_t *src0[1];
-  uint16_t *src1[1];
-  uint16_t *dest0[1];
-  uint16_t *dest1[1];
-  src0[0] = in0;
-  src1[0] = in1;
-  dest0[0] = out;
-  dest1[0] = out + 8;
-  highbd_transpose(src0, in_p, dest0, out_p, 1);
-  highbd_transpose(src1, in_p, dest1, out_p, 1);
+// Note: the input (x) and output (d) pointers must be distinct; the input
+// values are not preserved.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3,
+                                                __m128i *d4, __m128i *d5,
+                                                __m128i *d6, __m128i *d7) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // output
+  // 00 10 20 30 xx xx xx xx
+  // 01 11 21 31 xx xx xx xx
+  // 02 12 22 32 xx xx xx xx
+  // 03 13 23 33 xx xx xx xx
+  // 04 14 24 34 xx xx xx xx
+  // 05 15 25 35 xx xx xx xx
+  // 06 16 26 36 xx xx xx xx
+  // 07 17 27 37 xx xx xx xx
+  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
 }
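Splitting the 4x8 transpose into _low and _high halves means a caller that needs only four output rows can skip half of the unpacks. A hedged usage sketch, assuming this header is included and a 16-bit tile whose stride is given in elements (the helper itself is an assumption, not patch code):

#include <emmintrin.h>
#include <stdint.h>

/* Load four 8-wide high-bitdepth rows and produce only columns 0..3. */
static void load_and_transpose_4x8_low(const uint16_t *src, int stride,
                                       __m128i *d0, __m128i *d1, __m128i *d2,
                                       __m128i *d3) {
  __m128i x0 = _mm_loadu_si128((const __m128i *)(src + 0 * stride));
  __m128i x1 = _mm_loadu_si128((const __m128i *)(src + 1 * stride));
  __m128i x2 = _mm_loadu_si128((const __m128i *)(src + 2 * stride));
  __m128i x3 = _mm_loadu_si128((const __m128i *)(src + 3 * stride));
  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, d0, d1, d2, d3);
}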
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *x4, __m128i *x5,
+                                                __m128i *x6, __m128i *x7,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
+
+  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
+  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
+
+  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
+  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+                                                 __m128i *x2, __m128i *x3,
+                                                 __m128i *x4, __m128i *x5,
+                                                 __m128i *x6, __m128i *x7,
+                                                 __m128i *d4, __m128i *d5,
+                                                 __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
+  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
+
+  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
+  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
+
+  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
+  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
+}
+
+// Note: the input (x) and output (d) pointers must be distinct; the input
+// values are not preserved.
+static INLINE void highbd_transpose8x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
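The no-aliasing requirement exists because the combined transpose writes d0..d3 in the _low half before the _high half reads the x inputs again, so passing the same registers for x and d would corrupt the second half. If an in-place variant were ever wanted, copying the inputs first makes it safe; a hypothetical wrapper, not part of the patch:

#include <emmintrin.h>
#include <string.h>

/* In-place 8x8 transpose: keep the inputs alive in a temporary so the
 * outputs may overwrite v without violating the aliasing rule above. */
static INLINE void highbd_transpose8x8_inplace_sse2(__m128i v[8]) {
  __m128i t[8];
  memcpy(t, v, sizeof(t));
  highbd_transpose8x8_sse2(&t[0], &t[1], &t[2], &t[3], &t[4], &t[5], &t[6],
                           &t[7], &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
                           &v[6], &v[7]);
}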
we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + #endif // _AOM_DSP_X86_LPF_COMMON_X86_H diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 2536f91d2..1f42eec2f 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -12,8 +12,9 @@ #include <stdio.h> #include <tmmintrin.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" @@ -75,11 +76,9 @@ static INLINE unsigned int masked_sad4xh_ssse3( ref_stride, msk, msk_stride, n); \ } -#if CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(128, 128) MASKSADMXN_SSSE3(128, 64) MASKSADMXN_SSSE3(64, 128) -#endif // CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(64, 64) MASKSADMXN_SSSE3(64, 32) MASKSADMXN_SSSE3(32, 64) @@ -93,18 +92,12 @@ MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -MASKSADMXN_SSSE3(32, 128) -MASKSADMXN_SSSE3(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -239,7 +232,6 @@ static INLINE unsigned int masked_sad4xh_ssse3( return (sad + 31) >> 6; } -#if CONFIG_HIGHBITDEPTH // For width a multiple of 8 static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -277,11 +269,9 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( ref8, ref_stride, msk, msk_stride, n); \ } -#if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(128, 128) HIGHBD_MASKSADMXN_SSSE3(128, 64) HIGHBD_MASKSADMXN_SSSE3(64, 128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(64, 64) HIGHBD_MASKSADMXN_SSSE3(64, 32) HIGHBD_MASKSADMXN_SSSE3(32, 64) @@ -295,18 +285,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) HIGHBD_MASKSADMXN_SSSE3(16, 64) HIGHBD_MASKSADMXN_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_MASKSADMXN_SSSE3(32, 128) -HIGHBD_MASKSADMXN_SSSE3(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -424,5 +408,3 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( int sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; } - -#endif diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index 
3ffe132be..d7dbefd7d 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -13,13 +13,15 @@ #include <string.h> #include <tmmintrin.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/blend.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" -#include "aom_ports/mem.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" // For width a multiple of 16 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, @@ -108,11 +110,9 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ } -#if CONFIG_EXT_PARTITION MASK_SUBPIX_VAR_SSSE3(128, 128) MASK_SUBPIX_VAR_SSSE3(128, 64) MASK_SUBPIX_VAR_SSSE3(64, 128) -#endif MASK_SUBPIX_VAR_SSSE3(64, 64) MASK_SUBPIX_VAR_SSSE3(64, 32) MASK_SUBPIX_VAR_SSSE3(32, 64) @@ -126,18 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8) MASK_SUBPIX_VAR8XH_SSSE3(4) MASK_SUBPIX_VAR4XH_SSSE3(8) MASK_SUBPIX_VAR4XH_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) MASK_SUBPIX_VAR_SSSE3(64, 16) MASK_SUBPIX_VAR_SSSE3(16, 64) -#if CONFIG_EXT_PARTITION -MASK_SUBPIX_VAR_SSSE3(128, 32) -MASK_SUBPIX_VAR_SSSE3(32, 128) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -523,7 +517,6 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } -#if CONFIG_HIGHBITDEPTH // For width a multiple of 8 static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, @@ -695,11 +688,9 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) -#endif HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) @@ -713,18 +704,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32) -#endif -#endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -1040,4 +1025,40 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } -#endif +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? 
width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + assert(height % 2 == 0); + int i = 0; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, + mask + mask_stride, comp_pred + width); + comp_pred += (width << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } else { // width == 32 + assert(width == 32); + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16); + comp_pred += (width); + src0 += (stride0); + src1 += (stride1); + mask += (mask_stride); + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h new file mode 100644 index 000000000..dc41a8342 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H +#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H + +#include <stdlib.h> +#include <string.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" + +static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, + const uint8_t *src1, + const uint8_t *mask, uint8_t *dst) { + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); + const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); + const __m128i aA = _mm_load_si128((const __m128i *)(mask)); + + const __m128i maA = _mm_sub_epi8(alpha_max, aA); + + const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); + const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); + const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); + const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); + + const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); + const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); + + const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); + const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); + _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); +} + +static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + const uint8_t *mask, + int mask_stride) { + int i = 0; + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + // odd line A + const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); + const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1)); + 
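+    // What the arithmetic below implements:
+    //   comp_pred[i] = (a * s0 + (64 - a) * s1 + 32) >> 6,
+    // with a in [0, AOM_BLEND_A64_MAX_ALPHA] (= 64). _mm_maddubs_epi16 on the
+    // interleaved (s0, s1) / (a, 64 - a) pairs yields a*s0 + (64 - a)*s1
+    // exactly (at most 64 * 255 = 16320, safely inside int16), and
+    // _mm_mulhrs_epi16 with 1 << (15 - AOM_BLEND_A64_ROUND_BITS) realizes the
+    // rounding shift, since (x * 2^9 + 2^14) >> 15 == (x + 32) >> 6.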
const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask)); + // even line B + const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); + const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); + const __m128i a = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); + + const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); + const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); + + const __m128i ma = _mm_sub_epi8(alpha_max, a); + const __m128i aaA = _mm_unpacklo_epi8(a, ma); + const __m128i aaB = _mm_unpackhi_epi8(a, ma); + + const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); + const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); + const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); + const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); + const __m128i round = _mm_packus_epi16(roundA, roundB); + // comp_pred's stride == width == 8 + _mm_store_si128((__m128i *)(comp_pred), round); + comp_pred += (8 << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h new file mode 100644 index 000000000..8b69606dd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_DSP_X86_MEM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, + const int byte_stride) { + return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride), + *(const int32_t *)((int8_t *)src + 1 * byte_stride), + *(const int32_t *)((int8_t *)src + 2 * byte_stride), + *(const int32_t *)((int8_t *)src + 3 * byte_stride)); +} + +static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, + const int byte_stride) { + __m128i dst; + dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); + dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); + return dst; +} + +#endif // AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h index 73589a32a..a3535f985 100644 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -14,7 +14,7 @@ #include <immintrin.h> -#include "./aom_config.h" +#include "config/aom_config.h" static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { v_d = _mm_hadd_epi32(v_d, v_d); diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 52dd508ec..0338a8c77 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -12,7 +12,8 @@ #include <assert.h> #include <immintrin.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_ports/mem.h" #include "aom/aom_integer.h" @@ -24,9 +25,11 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - const int height) { +static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); @@ -59,11 +62,9 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, const int width, - const int height) { +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); @@ -119,11 +120,9 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, } \ } -#if CONFIG_EXT_PARTITION OBMCSADWXH(128, 128) OBMCSADWXH(128, 64) OBMCSADWXH(64, 128) -#endif // CONFIG_EXT_PARTITION OBMCSADWXH(64, 64) OBMCSADWXH(64, 32) OBMCSADWXH(32, 64) @@ -137,25 +136,22 @@ OBMCSADWXH(8, 8) OBMCSADWXH(8, 4) OBMCSADWXH(4, 8) OBMCSADWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) OBMCSADWXH(16, 64) OBMCSADWXH(64, 16) -#endif //////////////////////////////////////////////////////////////////////////////// // High bit-depth 
//////////////////////////////////////////////////////////////////////////////// -#if CONFIG_HIGHBITDEPTH -static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; @@ -189,11 +185,9 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int width, const int height) { +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; @@ -250,11 +244,9 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, } \ } -#if CONFIG_EXT_PARTITION HBD_OBMCSADWXH(128, 128) HBD_OBMCSADWXH(128, 64) HBD_OBMCSADWXH(64, 128) -#endif // CONFIG_EXT_PARTITION HBD_OBMCSADWXH(64, 64) HBD_OBMCSADWXH(64, 32) HBD_OBMCSADWXH(32, 64) @@ -268,12 +260,9 @@ HBD_OBMCSADWXH(8, 8) HBD_OBMCSADWXH(8, 4) HBD_OBMCSADWXH(4, 8) HBD_OBMCSADWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) -#endif -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 392616af3..571aa770b 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -12,7 +12,8 @@ #include <assert.h> #include <immintrin.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_ports/mem.h" #include "aom/aom_integer.h" @@ -128,11 +129,9 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#if CONFIG_EXT_PARTITION OBMCVARWXH(128, 128) OBMCVARWXH(128, 64) OBMCVARWXH(64, 128) -#endif // CONFIG_EXT_PARTITION OBMCVARWXH(64, 64) OBMCVARWXH(64, 32) OBMCVARWXH(32, 64) @@ -146,24 +145,17 @@ OBMCVARWXH(8, 8) OBMCVARWXH(8, 4) OBMCVARWXH(4, 8) OBMCVARWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) -#if CONFIG_EXT_PARTITION -OBMCVARWXH(32, 128) -OBMCVARWXH(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// -#if CONFIG_HIGHBITDEPTH static INLINE void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { @@ -278,8 +270,19 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, uint64_t sse64 = 0; if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else { + } else if (w < 128 || h < 128) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, 
&sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); @@ -291,28 +294,23 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; - if (w == 128) { - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128, - 32); - pre8 += 32 * pre_stride; - wsrc += 32 * 128; - mask += 32 * 128; - h -= 32; - } while (h > 0); - } else if (w == 64 && h >= 128) { - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64, - 64); - pre8 += 64 * pre_stride; - wsrc += 64 * 64; - mask += 64 * 64; - h -= 64; - } while (h > 0); - } else if (w == 4) { + int max_pel_allowed_per_ovf = 512; + if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else { + } else if (w * h <= max_pel_allowed_per_ovf) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); @@ -347,11 +345,9 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION HBD_OBMCVARWXH(128, 128) HBD_OBMCVARWXH(128, 64) HBD_OBMCVARWXH(64, 128) -#endif // CONFIG_EXT_PARTITION HBD_OBMCVARWXH(64, 64) HBD_OBMCVARWXH(64, 32) HBD_OBMCVARWXH(32, 64) @@ -365,16 +361,9 @@ HBD_OBMCVARWXH(8, 8) HBD_OBMCVARWXH(8, 4) HBD_OBMCVARWXH(4, 8) HBD_OBMCVARWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) HBD_OBMCVARWXH(16, 64) HBD_OBMCVARWXH(64, 16) -#if CONFIG_EXT_PARTITION -HBD_OBMCVARWXH(32, 128) -HBD_OBMCVARWXH(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm index 954a95b98..e6b40262d 100644 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -44,16 +44,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m0, [zbinq] ; m0 = zbin ; Get DC and first 15 AC coeffs - in this special case, that is all. 
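; NOTE: since libaom v1.0.0 high-bit-depth support is always built, so
; tran_low_t is unconditionally 32 bits wide; the packssdw narrowing kept
; below becomes the only load path and the 16-bit variant is removed.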
-%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers but we process them as 16 bit numbers mova m9, [coeffq] packssdw m9, [coeffq+16] ; m9 = c[i] mova m10, [coeffq+32] packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif mov r0, eobmp ; Output pointer mov r1, qcoeffmp ; Output pointer @@ -76,15 +71,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ptest m14, m14 jnz .single_nonzero -%if CONFIG_HIGHBITDEPTH mova [r1 ], ymm5 mova [r1+32], ymm5 mova [r2 ], ymm5 mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif mov [r0], word 0 vzeroupper @@ -124,7 +114,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -136,16 +125,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmovsxwd m11, m13 mova [qcoeffq+32], m11 mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q -%if CONFIG_HIGHBITDEPTH ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -157,10 +141,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmovsxwd m11, m13 mova [dqcoeffq+32], m11 mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif mova m6, [iscanq] ; m6 = scan[i] mova m11, [iscanq+16] ; m11 = scan[i] @@ -229,29 +209,20 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif + lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs -%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [coeffq+ncoeffq*4+ 0] packssdw m9, [coeffq+ncoeffq*4+16] mova m10, [coeffq+ncoeffq*4+32] packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -264,16 +235,10 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ ptest m14, m14 jnz .first_nonzero -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4 ], ymm5 mova [qcoeffq+ncoeffq*4+32], ymm5 mova [dqcoeffq+ncoeffq*4 ], ymm5 mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - add ncoeffq, mmsize punpckhqdq m1, m1 @@ -302,7 +267,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -314,10 +278,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif %ifidn %1, b_32x32 pabsw m8, m8 @@ -333,7 +293,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ psignw m13, m10 
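  ; (b_32x32 only) the dequantized magnitude is halved; psrlw is a logical
  ; shift, hence the pabsw/psignw pair around it to shift |dqc| right by one
  ; and then reapply the sign.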
%endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -345,10 +304,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 @@ -363,16 +318,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ .ac_only_loop: -%if CONFIG_HIGHBITDEPTH ; pack coeff from 32bit to 16bit array mova m9, [coeffq+ncoeffq*4+ 0] packssdw m9, [coeffq+ncoeffq*4+16] mova m10, [coeffq+ncoeffq*4+32] packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -385,15 +335,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ ptest m14, m14 jnz .rest_nonzero -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4+ 0], ymm5 mova [qcoeffq+ncoeffq*4+32], ymm5 mova [dqcoeffq+ncoeffq*4+ 0], ymm5 mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif + add ncoeffq, mmsize jnz .ac_only_loop @@ -424,7 +370,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pand m14, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 @@ -436,10 +381,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif %ifidn %1, b_32x32 pabsw m14, m14 @@ -454,7 +395,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 @@ -466,10 +406,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 @@ -510,27 +446,16 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_HIGHBITDEPTH lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq pxor m7, m7 .blank_loop: -%if CONFIG_HIGHBITDEPTH mova [dqcoeffq+ncoeffq*4+ 0], ymm7 mova [dqcoeffq+ncoeffq*4+32], ymm7 mova [qcoeffq+ncoeffq*4+ 0], ymm7 mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif add ncoeffq, mmsize jl .blank_loop @@ -543,5 +468,3 @@ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob INIT_XMM avx QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 - -END diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 0e7f679d0..46b9c7d29 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -12,7 +12,8 @@ #include <emmintrin.h> #include <xmmintrin.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include 
"aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm index 36b4dddbd..e2c1ebb71 100644 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -45,7 +45,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] + psubw m0, [GLOBAL(pw_1)] mov r2, shiftmp mov r3, qcoeffmp mova m4, [r2] ; m4 = shift @@ -56,29 +56,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_HIGHBITDEPTH lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs -%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -99,7 +88,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 @@ -117,10 +106,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif + %ifidn %1, b_32x32 pabsw m8, m8 pabsw m13, m13 @@ -134,7 +120,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m8, m9 psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 @@ -152,10 +137,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -169,16 +150,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: -%if CONFIG_HIGHBITDEPTH ; pack coeff from 32bit to 16bit array mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif + pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -201,7 +178,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m14, m7 
pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 @@ -220,10 +196,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif + %ifidn %1, b_32x32 pabsw m14, m14 pabsw m13, m13 @@ -236,7 +209,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m14 mova m6, m14 @@ -254,10 +227,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif + pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -274,7 +244,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4+ 0], m5 mova [qcoeffq+ncoeffq*4+16], m5 mova [qcoeffq+ncoeffq*4+32], m5 @@ -283,12 +252,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+16], m5 mova [dqcoeffq+ncoeffq*4+32], m5 mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif add ncoeffq, mmsize jl .ac_only_loop %endif @@ -313,17 +276,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mov r2, qcoeffmp mov r3, eobmp DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_HIGHBITDEPTH lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif neg ncoeffq pxor m7, m7 .blank_loop: -%if CONFIG_HIGHBITDEPTH mova [dqcoeffq+ncoeffq*4+ 0], m7 mova [dqcoeffq+ncoeffq*4+16], m7 mova [dqcoeffq+ncoeffq*4+32], m7 @@ -332,12 +289,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+16], m7 mova [qcoeffq+ncoeffq*4+32], m7 mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif add ncoeffq, mmsize jl .blank_loop mov word [eobq], 0 diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c index e60f518b4..f662b62b1 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c @@ -9,7 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <immintrin.h> // AVX2 -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 2c67f450f..55a856985 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -233,11 +233,9 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ %endmacro INIT_XMM sse2 -%if CONFIG_EXT_PARTITION SADNXN4D 128, 128 SADNXN4D 128, 64 SADNXN4D 64, 128 -%endif SADNXN4D 64, 64 SADNXN4D 64, 32 SADNXN4D 32, 64 @@ -251,11 +249,9 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 -%if CONFIG_EXT_PARTITION_TYPES SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 SADNXN4D 16, 64 SADNXN4D 64, 16 -%endif diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c index efba61289..a50dba64a 100644 --- a/third_party/aom/aom_dsp/x86/sad_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_avx2.c @@ -9,7 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" #define FSAD64_H(h) \ diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c index e8dd87a26..b506d4663 100644 --- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c @@ -11,10 +11,11 @@ #include <immintrin.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" // SAD @@ -360,7 +361,6 @@ unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, return sum; } -#if CONFIG_EXT_PARTITION static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[8], r[8]; @@ -471,7 +471,6 @@ unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); return sum; } -#endif // CONFIG_EXT_PARTITION // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. 
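// A minimal scalar sketch of those two modes (annotation only; the helper
// name is hypothetical and not part of libaom), assuming the compound
// reference uses the (x + y + 1) >> 1 rounding that _mm256_avg_epu16 applies:
static INLINE unsigned int highbd_sad_avg_model(const uint16_t *src,
                                                const uint16_t *ref,
                                                const uint16_t *sec,  // may be NULL
                                                int n) {
  unsigned int sad = 0;
  for (int i = 0; i < n; ++i) {
    const int r = sec ? (ref[i] + sec[i] + 1) >> 1 : ref[i];
    const int d = (int)src[i] - r;
    sad += (d < 0) ? (unsigned int)-d : (unsigned int)d;
  }
  return sad;
}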
static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, @@ -649,7 +648,6 @@ unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, return sum; } -#if CONFIG_EXT_PARTITION unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -697,19 +695,13 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, second_pred); return sum; } -#endif // CONFIG_EXT_PARTITION // SAD 4D // Combine 4 __m256i vectors to uint32_t result[4] static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0, - UINT32_MAX, 0, UINT32_MAX, 0); -#else - const __m256i mask = _mm256_set1_epi64x(UINT32_MAX); -#endif + const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); __m128i sad; // 8 32-bit summation @@ -967,7 +959,6 @@ void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, sad_array[3] = first_half[3] + second_half[3]; } -#if CONFIG_EXT_PARTITION void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) { @@ -1045,4 +1036,3 @@ void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, sad_array[2] = first_half[2] + second_half[2]; sad_array[3] = first_half[3] + second_half[3]; } -#endif // CONFIG_EXT_PARTITION diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c index 4419c65b2..c6fd62c9e 100644 --- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c @@ -10,7 +10,8 @@ */ #include <immintrin.h> -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index b4cc6abf1..3251b7655 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -47,7 +47,6 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ %endif ; %3 == 7 %endmacro -%if CONFIG_EXT_PARTITION ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 @@ -114,7 +113,6 @@ SAD128XN 128 ; sad128x128_sse2 SAD128XN 128, 1 ; sad128x128_avg_sse2 SAD128XN 64 ; sad128x64_sse2 SAD128XN 64, 1 ; sad128x64_avg_sse2 -%endif ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, @@ -155,18 +153,14 @@ SAD128XN 64, 1 ; sad128x64_avg_sse2 %endmacro INIT_XMM sse2 -%if CONFIG_EXT_PARTITION SAD64XN 128 ; sad64x128_sse2 SAD64XN 128, 1 ; sad64x128_avg_sse2 -%endif SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD64XN 16 ; sad64x16_sse2 SAD64XN 16, 1 ; sad64x16_avg_sse2 -%endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -212,10 +206,8 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD32XN 8 ; sad_32x8_sse2 SAD32XN 8, 1 ; sad_32x8_avg_sse2 -%endif ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ 
-262,12 +254,10 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 SAD16XN 64 ; sad_16x64_sse2 SAD16XN 64, 1 ; sad_16x64_avg_sse2 -%endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -312,10 +302,8 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD8XN 32 ; sad_8x32_sse2 SAD8XN 32, 1 ; sad_8x32_avg_sse2 -%endif ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -361,7 +349,5 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse -%if CONFIG_EXT_PARTITION_TYPES SAD4XN 16 ; sad_4x16_sse2 SAD4XN 16, 1 ; sad_4x16_avg_sse2 -%endif diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm deleted file mode 100644 index f6c27c855..000000000 --- a/third_party/aom/aom_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,377 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBAOM_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBAOM_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - 
movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -;void int aom_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x16x3_sse3) PRIVATE -sym(aom_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x8x3_sse3) PRIVATE -sym(aom_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad8x16x3_sse3) PRIVATE -sym(aom_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 
2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad8x8x3_sse3) PRIVATE -sym(aom_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad4x4x3_sse3) PRIVATE -sym(aom_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm deleted file mode 100644 index 5e9c75845..000000000 --- a/third_party/aom/aom_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,362 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - - -%include "aom_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -;void aom_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -global sym(aom_sad16x16x8_sse4_1) PRIVATE -sym(aom_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - 
PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad16x8x8_sse4_1) PRIVATE -sym(aom_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad8x8x8_sse4_1) PRIVATE -sym(aom_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad8x16x8_sse4_1) PRIVATE -sym(aom_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad4x4x8_sse4_1) PRIVATE -sym(aom_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm deleted file mode 100644 index 96b64b040..000000000 --- a/third_party/aom/aom_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,373 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - - -%include "aom_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -;void int aom_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x16x3_ssse3) PRIVATE -sym(aom_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .aom_sad16x16x3_ssse3_skiptable -.aom_sad16x16x3_ssse3_jumptable: - dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_6 - 
.aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump -.aom_sad16x16x3_ssse3_skiptable: - - call .aom_sad16x16x3_ssse3_do_jump -.aom_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3 - -.aom_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.aom_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int aom_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x8x3_ssse3) PRIVATE -sym(aom_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .aom_sad16x8x3_ssse3_skiptable -.aom_sad16x8x3_ssse3_jumptable: - dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_7 - 
.aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump -.aom_sad16x8x3_ssse3_skiptable: - - call .aom_sad16x8x3_ssse3_do_jump -.aom_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3 - -.aom_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.aom_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm index aa70106c8..6d9b5a12f 100644 --- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm @@ -47,6 +47,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm index d3feb7ec0..45bf6ec3c 100644 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm @@ -117,27 +117,26 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. 
May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -155,9 +154,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse, g_bilin_filter, g_pw_8 %define block_height heightd ;Store bilin_filter and pw_8 location in stack @@ -176,25 +175,18 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -374,8 +366,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -383,7 +375,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -400,7 +392,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -697,8 +689,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -706,7 +698,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] 
%endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -723,7 +715,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -855,8 +847,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -864,7 +856,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -881,7 +873,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -997,8 +989,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1006,7 +998,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1023,7 +1015,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1195,8 +1187,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1209,7 +1201,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1237,7 +1229,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm index 7bd5b23ad..1a75a234f 100644 --- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm +++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm @@ -34,10 +34,8 @@ cglobal subtract_block, 7, 7, 8, \ je .case_16 cmp colsd, 32 je .case_32 -%if CONFIG_EXT_PARTITION cmp colsd, 64 je .case_64 -%endif %macro loop16 6 mova m0, [srcq+%1] @@ -62,7 +60,6 @@ cglobal subtract_block, 7, 7, 8, \ mova [diffq+mmsize*1+%6], m1 %endmacro -%if CONFIG_EXT_PARTITION mov pred_str, pred_stridemp .loop_128: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize @@ -77,7 +74,6 @@ cglobal subtract_block, 7, 7, 8, \ RET .case_64: -%endif mov pred_str, pred_stridemp .loop_64: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c index 6be99fbca..a79f22d79 100644 --- 
a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -14,35 +14,62 @@ #include <stdio.h> #include "aom_dsp/x86/synonyms.h" +#include "config/aom_dsp_rtcd.h" -#include "./aom_dsp_rtcd.h" +static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { + const __m128d ad = _mm_castsi128_pd(a); + return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); +} + +static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(a); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, a); + return tmp; + } +#endif +} + +static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { + const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); + const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); + const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); + const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); + const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); + const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); + + return _mm_add_epi32(v_sq_01_d, v_sq_23_d); +} static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = + const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); + __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - + v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8)); return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } +static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { + int r = 0; + __m128i v_acc_q = _mm_setzero_si128(); + do { + const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride); + v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d); + src += stride << 2; + r += 4; + } while (r < height); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), + _mm_and_si128(v_acc_q, v_zext_mask_q)); + v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); + return xx_cvtsi128_si64(v_acc_64); +} + #ifdef __GNUC__ // This prevents GCC/Clang from inlining this function into // aom_sum_squares_2d_i16_sse2, which in turn saves some stack @@ -52,72 +79,45 @@ __attribute__((noinline)) static uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { - int r, c; + int r = 0; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc_q = _mm_setzero_si128(); - for (r = 0; r < height; r += 8) { + do { __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < width; c += 8) { + int c = 0; + 
do { const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } + c += 8; + } while (c < width); v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - src += 8 * stride; - } + src += 4 * stride; + r += 4; + } while (r < height); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } -#endif + return xx_cvtsi128_si64(v_acc_q); } uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, @@ -127,7 +127,9 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, // are with size == 4, so it is also the common case. 
if (LIKELY(width == 4 && height == 4)) { return aom_sum_squares_2d_i16_4x4_sse2(src, stride); - } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) { + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); } else { @@ -140,7 +142,7 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, ////////////////////////////////////////////////////////////////////////////// static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128(); @@ -185,16 +187,7 @@ static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc0_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); - return tmp; - } -#endif + return xx_cvtsi128_si64(v_acc0_q); } uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index cd049a454..d9a53fcc5 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -14,7 +14,8 @@ #include <immintrin.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" /** @@ -58,6 +59,28 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) { _mm_storeu_si128((__m128i *)a, v); } +// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set_epi64x() +// acting on 32-bit integers. +static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, e1, 0, e0); +#else + return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); +#endif +} + +// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m128i xx_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, a, 0, a); +#else + return _mm_set1_epi64x((uint32_t)a); +#endif +} + static INLINE __m128i xx_round_epu16(__m128i v_val_w) { return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); } @@ -89,4 +112,12 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } +static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); + const __m128i v_tmp_d = + _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + #endif // AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h new file mode 100644 index 000000000..39f371fc9 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_DSP_X86_SYNONYMS_AVX2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. +static INLINE __m256i yy_load_256(const void *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE __m256i yy_loadu_256(const void *a) { + return _mm256_loadu_si256((const __m256i *)a); +} + +static INLINE void yy_store_256(void *const a, const __m256i v) { + _mm256_store_si256((__m256i *)a, v); +} + +static INLINE void yy_storeu_256(void *const a, const __m256i v) { + _mm256_storeu_si256((__m256i *)a, v); +} + +// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm256_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m256i yy_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); +#else + return _mm256_set1_epi64x((uint32_t)a); +#endif +} + +// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We +// therefore define an equivalent function using a different intrinsic. +// ([ hi ], [ lo ]) -> [ hi ][ lo ] +static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h new file mode 100644 index 000000000..f88a1527d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_DSP_X86_TRANSPOSE_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 
51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h deleted file mode 100644 index 1a8fed710..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
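All of the transpose helpers in the new transpose_sse2.h above are built from the same unpack cascade: interleave row pairs at the element width, then again at double the width, until whole rows have become columns. A standalone sketch of the 32-bit 4x4 case, mirroring transpose_32bit_4x4 but using unaligned loads and stores so it runs on any buffer (helper name is illustrative only):

#include <emmintrin.h>  // SSE2
#include <stdint.h>
#include <stdio.h>

// Transpose a row-major 4x4 block of 32-bit values.
static void transpose_4x4_i32(const int32_t *in, int32_t *out) {
  const __m128i r0 = _mm_loadu_si128((const __m128i *)(in + 0));   // 00 01 02 03
  const __m128i r1 = _mm_loadu_si128((const __m128i *)(in + 4));   // 10 11 12 13
  const __m128i r2 = _mm_loadu_si128((const __m128i *)(in + 8));   // 20 21 22 23
  const __m128i r3 = _mm_loadu_si128((const __m128i *)(in + 12));  // 30 31 32 33
  const __m128i a0 = _mm_unpacklo_epi32(r0, r1);                   // 00 10 01 11
  const __m128i a1 = _mm_unpacklo_epi32(r2, r3);                   // 20 30 21 31
  const __m128i a2 = _mm_unpackhi_epi32(r0, r1);                   // 02 12 03 13
  const __m128i a3 = _mm_unpackhi_epi32(r2, r3);                   // 22 32 23 33
  _mm_storeu_si128((__m128i *)(out + 0), _mm_unpacklo_epi64(a0, a1));   // 00 10 20 30
  _mm_storeu_si128((__m128i *)(out + 4), _mm_unpackhi_epi64(a0, a1));   // 01 11 21 31
  _mm_storeu_si128((__m128i *)(out + 8), _mm_unpacklo_epi64(a2, a3));   // 02 12 22 32
  _mm_storeu_si128((__m128i *)(out + 12), _mm_unpackhi_epi64(a2, a3));  // 03 13 23 33
}

int main(void) {
  int32_t in[16], out[16];
  for (int i = 0; i < 16; ++i) in[i] = i;
  transpose_4x4_i32(in, out);
  // Prints 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15: rows became columns.
  for (int i = 0; i < 16; ++i) printf("%d%c", out[i], (i % 4 == 3) ? '\n' : ' ');
  return 0;
}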
- */ - -#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H -#define AOM_DSP_X86_TXFM_COMMON_AVX2_H - -#include <immintrin.h> - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/common_avx2.h" - -#define pair256_set_epi16(a, b) \ - _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define pair256_set_epi32(a, b) \ - _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ - (int)(b), (int)(a)) - -static INLINE void mm256_reverse_epi16(__m256i *u) { - const __m256i control = _mm256_set_epi16( - 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, - 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); - __m256i v = _mm256_shuffle_epi8(*u, control); - *u = _mm256_permute2x128_si256(v, v, 1); -} - -static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *cospi) { - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i y0 = _mm256_madd_epi16(*a0, *cospi); - __m256i y1 = _mm256_madd_epi16(*a1, *cospi); - - y0 = _mm256_add_epi32(y0, dct_rounding); - y1 = _mm256_add_epi32(y1, dct_rounding); - y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); - - return _mm256_packs_epi32(y0, y1); -} - -static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2_epi16 = _mm256_set1_epi16(c); - const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i u0, u1; - int i = 0; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 1); - - u0 = _mm256_unpacklo_epi16(zero, in[i]); - u1 = _mm256_unpackhi_epi16(zero, in[i]); - - u0 = _mm256_madd_epi16(u0, sqrt2_epi16); - u1 = _mm256_madd_epi16(u1, sqrt2_epi16); - - u0 = _mm256_add_epi32(u0, dct_const_rounding); - u1 = _mm256_add_epi32(u1, dct_const_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - in[i] = _mm256_packs_epi32(u0, u1); - i++; - } -} - -#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h deleted file mode 100644 index 4e6eecd32..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
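The deleted butter_fly above is the standard fixed-point rotation used throughout these transforms: the packed multiply-add forms x*c0 + y*c1 per 32-bit lane from interleaved 16-bit pairs, then the sum is rounded and shifted down by DCT_CONST_BITS. A scalar model of one lane, assuming the usual libaom definition DCT_CONST_BITS == 14 (the SIMD version additionally saturates to 16 bits via _mm256_packs_epi32, omitted here):

#include <stdint.h>

enum { DCT_CONST_BITS = 14, DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1) };

// One lane of butter_fly(): a rounded Q14 dot product of a 16-bit pair
// (x, y) with a cosine-constant pair (c0, c1).
static int32_t butterfly_lane(int16_t x, int16_t y, int16_t c0, int16_t c1) {
  const int32_t t = (int32_t)x * c0 + (int32_t)y * c1;  // _mm256_madd_epi16
  return (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;    // round, then shift
}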
- */ - -#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ -#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ - -// Note: -// This header file should be put below any x86 intrinsics head file - -static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { - if (sizeof(tran_low_t) == 4) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); - } else { - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); - } -} - -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h index 4257d8b9c..58a792424 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -16,17 +16,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#define pair_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define dual_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ - (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) - -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ - _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ - (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) // Reverse the 8 16 bit words in __m128i static INLINE __m128i mm_reverse_epi16(const __m128i x) { @@ -35,292 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } -#if CONFIG_EXT_TX -// Identity transform (both forward and inverse). 
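The rewritten pair_set_epi16 in the txfm_common_sse2.h hunk above relies on little-endian lane order: broadcasting the 32-bit value a | (b << 16) puts a in every even 16-bit lane and b in every odd lane, exactly the interleaved constant layout that madd-style butterflies consume, at the cost of a single _mm_set1_epi32. A quick self-check (hypothetical test program):

#include <emmintrin.h>  // SSE2
#include <assert.h>
#include <stdint.h>

#define pair_set_epi16(a, b) \
  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))

int main(void) {
  // Matches the old _mm_set_epi16(b, a, b, a, ...) definition lane for lane.
  const __m128i v = pair_set_epi16(-3, 7);
  int16_t lanes[8];
  _mm_storeu_si128((__m128i *)lanes, v);
  for (int i = 0; i < 8; i += 2) {
    assert(lanes[i] == -3 && lanes[i + 1] == 7);
  }
  return 0;
}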
-static INLINE void idtx16_8col(__m128i *in) { - const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); - const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i y0, y1, y2, y3, y4, y5, y6, y7; - - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); - in[8] = _mm_slli_epi16(in[8], 1); - in[9] = _mm_slli_epi16(in[9], 1); - in[10] = _mm_slli_epi16(in[10], 1); - in[11] = _mm_slli_epi16(in[11], 1); - in[12] = _mm_slli_epi16(in[12], 1); - in[13] = _mm_slli_epi16(in[13], 1); - in[14] = _mm_slli_epi16(in[14], 1); - in[15] = _mm_slli_epi16(in[15], 1); - - v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); - v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); - v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); - v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); - v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16); - v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16); - v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16); - v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16); - - u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16); - u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16); - u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16); - u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16); - u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16); - u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16); - u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16); - u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16); - - x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16); - x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16); - x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16); - x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16); - x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16); - x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16); - x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16); - x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16); - - y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16); - y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16); - y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16); - y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16); - y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16); - y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16); - y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16); - y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16); - - v0 = _mm_madd_epi16(v0, k__sqrt2_epi16); - v1 = _mm_madd_epi16(v1, k__sqrt2_epi16); - v2 = _mm_madd_epi16(v2, k__sqrt2_epi16); - v3 = _mm_madd_epi16(v3, k__sqrt2_epi16); - v4 = _mm_madd_epi16(v4, k__sqrt2_epi16); - v5 = _mm_madd_epi16(v5, k__sqrt2_epi16); - v6 = _mm_madd_epi16(v6, k__sqrt2_epi16); - v7 = _mm_madd_epi16(v7, k__sqrt2_epi16); - - x0 = _mm_madd_epi16(x0, k__sqrt2_epi16); - x1 = _mm_madd_epi16(x1, k__sqrt2_epi16); - x2 = _mm_madd_epi16(x2, k__sqrt2_epi16); - x3 = _mm_madd_epi16(x3, k__sqrt2_epi16); - x4 = _mm_madd_epi16(x4, k__sqrt2_epi16); - x5 = _mm_madd_epi16(x5, k__sqrt2_epi16); - x6 = _mm_madd_epi16(x6, k__sqrt2_epi16); - x7 = _mm_madd_epi16(x7, k__sqrt2_epi16); - - u0 = _mm_madd_epi16(u0, k__sqrt2_epi16); - u1 = _mm_madd_epi16(u1, k__sqrt2_epi16); - u2 = _mm_madd_epi16(u2, k__sqrt2_epi16); - u3 = _mm_madd_epi16(u3, k__sqrt2_epi16); - u4 = _mm_madd_epi16(u4, k__sqrt2_epi16); - u5 = _mm_madd_epi16(u5, k__sqrt2_epi16); - u6 = 
_mm_madd_epi16(u6, k__sqrt2_epi16); - u7 = _mm_madd_epi16(u7, k__sqrt2_epi16); - - y0 = _mm_madd_epi16(y0, k__sqrt2_epi16); - y1 = _mm_madd_epi16(y1, k__sqrt2_epi16); - y2 = _mm_madd_epi16(y2, k__sqrt2_epi16); - y3 = _mm_madd_epi16(y3, k__sqrt2_epi16); - y4 = _mm_madd_epi16(y4, k__sqrt2_epi16); - y5 = _mm_madd_epi16(y5, k__sqrt2_epi16); - y6 = _mm_madd_epi16(y6, k__sqrt2_epi16); - y7 = _mm_madd_epi16(y7, k__sqrt2_epi16); - - v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); - x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); - x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); - x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); - x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING); - x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING); - x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING); - x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING); - - u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); - y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); - y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); - y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); - y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING); - y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING); - y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING); - y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - x0 = _mm_srai_epi32(x0, DCT_CONST_BITS); - x1 = _mm_srai_epi32(x1, DCT_CONST_BITS); - x2 = _mm_srai_epi32(x2, DCT_CONST_BITS); - x3 = _mm_srai_epi32(x3, DCT_CONST_BITS); - x4 = _mm_srai_epi32(x4, DCT_CONST_BITS); - x5 = _mm_srai_epi32(x5, DCT_CONST_BITS); - x6 = _mm_srai_epi32(x6, DCT_CONST_BITS); - x7 = _mm_srai_epi32(x7, DCT_CONST_BITS); - - u0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - y0 = _mm_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm_srai_epi32(y1, DCT_CONST_BITS); - y2 = _mm_srai_epi32(y2, DCT_CONST_BITS); - y3 = _mm_srai_epi32(y3, DCT_CONST_BITS); - y4 = _mm_srai_epi32(y4, DCT_CONST_BITS); - y5 = _mm_srai_epi32(y5, DCT_CONST_BITS); - y6 = _mm_srai_epi32(y6, DCT_CONST_BITS); - y7 = _mm_srai_epi32(y7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(v0, x0); - in[1] = _mm_packs_epi32(v1, x1); - in[2] = _mm_packs_epi32(v2, x2); - in[3] = _mm_packs_epi32(v3, x3); - in[4] = 
_mm_packs_epi32(v4, x4); - in[5] = _mm_packs_epi32(v5, x5); - in[6] = _mm_packs_epi32(v6, x6); - in[7] = _mm_packs_epi32(v7, x7); - - in[8] = _mm_packs_epi32(u0, y0); - in[9] = _mm_packs_epi32(u1, y1); - in[10] = _mm_packs_epi32(u2, y2); - in[11] = _mm_packs_epi32(u3, y3); - in[12] = _mm_packs_epi32(u4, y4); - in[13] = _mm_packs_epi32(u5, y5); - in[14] = _mm_packs_epi32(u6, y6); - in[15] = _mm_packs_epi32(u7, y7); -} -#endif // CONFIG_EXT_TX - -static INLINE void scale_sqrt2_8x4(__m128i *in) { - // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32 - // consecutive elements. - const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); - - const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); - const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); - const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); - const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); - const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); - const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); - const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); - const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); - - const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); - - in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); - in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); - in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); - in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); -} - -static INLINE void scale_sqrt2_8x8(__m128i *in) { - // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)' - // for each element. 
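As the comments above note, the deleted scale_sqrt2_* helpers implement ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS) without a 32-bit multiply instruction: _mm_mullo_epi16 and _mm_mulhi_epi16 give the low and signed high halves of each 16x16-bit product, and interleaving them with unpacklo/unpackhi reconstructs the exact 32-bit products, which are then rounded and packed back down (the body of scale_sqrt2_8x8 continues below). A self-contained check of the widening identity, using an arbitrary 16-bit constant:

#include <emmintrin.h>  // SSE2
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int16_t k = 23170;  // any 16-bit constant works for the demo
  const int16_t x[8] = { -32768, -5, 0, 7, 100, -1000, 32767, 1 };
  const __m128i vx = _mm_loadu_si128((const __m128i *)x);
  const __m128i vk = _mm_set1_epi16(k);
  const __m128i lo = _mm_mullo_epi16(vx, vk);  // low 16 bits of each product
  const __m128i hi = _mm_mulhi_epi16(vx, vk);  // signed high 16 bits
  // Interleaving (lo, hi) rebuilds each full 32-bit product in one lane.
  int32_t p[8];
  _mm_storeu_si128((__m128i *)(p + 0), _mm_unpacklo_epi16(lo, hi));
  _mm_storeu_si128((__m128i *)(p + 4), _mm_unpackhi_epi16(lo, hi));
  for (int i = 0; i < 8; ++i) assert(p[i] == (int32_t)x[i] * k);
  return 0;
}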
- const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); - - const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); - const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); - const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); - const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); - const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); - const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); - const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); - const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); - const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w); - const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w); - const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w); - const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w); - const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w); - const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w); - const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w); - const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w); - - const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w); - const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w); - const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w); - const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w); - const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w); - const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w); - const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w); - const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w); - - in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); - in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); - in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); - in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); - in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS)); - in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS)); - in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS)); - in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS)); -} - -static INLINE void scale_sqrt2_8x16(__m128i *in) { - scale_sqrt2_8x8(in); - scale_sqrt2_8x8(in + 8); -} - #endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 18a70dffe..7d6b7d287 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -10,109 +10,224 @@ */ #include <immintrin.h> 
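[Editorial aside] Before moving on to variance_avx2.c: the scale_sqrt2_* helpers removed from txfm_common_sse2.h just above all compute ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS) per lane, recovering the full 32-bit product of two 16-bit values from the _mm_mullo_epi16/_mm_mulhi_epi16 pair and re-splicing it with _mm_unpacklo/hi_epi16 before the rounding shift. A scalar model of one lane, assuming the usual aom_dsp conventions (DCT_CONST_BITS = 14 and Sqrt2 as the 2^14-scaled constant 23170; both values are assumptions, not quoted from this patch):

#include <stdint.h>

#define DCT_CONST_BITS 14  // aom_dsp fixed-point convention (assumed)
#define SQRT2_14 23170     // assumed: round(sqrt(2) * 2^14)

// Scalar equivalent of one lane of the removed scale_sqrt2_* helpers.
static int16_t scale_sqrt2_scalar(int16_t x) {
  // mullo/mulhi give the low/high halves of this signed 32-bit product.
  const int32_t prod = (int32_t)x * SQRT2_14;
  // Round-to-nearest shift, as xx_roundn_epi32_unsigned does per 32-bit lane.
  return (int16_t)((prod + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}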
-#include "./aom_dsp_rtcd.h"
-
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
-
-void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum);
-
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, int w, int h,
-                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
-                          int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+  return _mm_add_epi16(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
 }
-unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                aom_get16x16var_avx2, 16);
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+  return _mm_add_epi32(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
+
+  // unpack into pairs of source and reference values
+  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+  // subtract adjacent elements using src*1 + ref*-1
+  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
-  variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-  _mm256_zeroupper();
-  return variance;
+  // add to the running totals
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
 }
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  _mm256_zeroupper();
-  return *sse;
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+                                                     unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+  // unpack sse and sum registers and add
+  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+  // perform the final summation and extract the results
+  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(res);
+  return _mm_extract_epi32(res, 1);
+}
+
+// handle pixels (<= 512)
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+                                          unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+
const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } } -unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, 
ref_stride, 32, 32, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - aom_get32x32var_avx2, 32); +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11); - _mm256_zeroupper(); - return variance; +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32); 
// 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) +AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; } unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, @@ -125,68 +240,164 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, int height, unsigned int *sseptr); -unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - unsigned int sse1; - const int se1 = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); - unsigned int sse2; - const int se2 = - aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; - *sse = sse1 + sse2; - - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; -} - -unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - const int se = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; -} - -unsigned int aom_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - unsigned int sse1; - const int se1 = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); - unsigned int sse2; - const int se2 = aom_sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); 
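[Editorial aside] Every function generated by the AOM_VAR_*_AVX2 macros above (and their SSE2 counterparts later in this diff) is the textbook identity variance = SSE - sum^2 / N, where N = bw * bh is a power of two, so the division becomes the `>> bits` shift. A scalar reference under those assumptions (variance_ref is an illustrative name, not part of the patch); note that variance_kernel_avx2 obtains the same per-pixel differences with a single _mm256_maddubs_epi16 against the interleaved (1,-1) pattern instead of widening src and ref separately:

#include <stdint.h>

static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;                       // kept in 16-bit lanes in the SIMD path
      sse64 += (uint64_t)(diff * diff);  // accumulated via _mm256_madd_epi16
    }
  }
  *sse = (unsigned int)sse64;
  // The macros compute (sum * sum) >> bits, with bits == log2(w * h).
  return *sse - (unsigned int)((sum * sum) / (w * h));
}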
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } - *sse = sse1 + sse2; +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } -unsigned int aom_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - // Process 32 elements in parallel. - const int se = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; +static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? 
pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { // for width == 32 + do { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); + + const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); + const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); + const __m256i aB = + _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (32 << 1); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } } diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c index 999b541e3..88e27aef3 100644 --- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> // AVX2 -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" /* clang-format off */ @@ -35,203 +36,6 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { }; /* clang-format on */ -void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = 
_mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } - _mm256_zeroupper(); -} - -void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const *)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, 
ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } - _mm256_zeroupper(); -} - #define FILTER_SRC(filter) \ /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 211fad3f8..c8c90a7dc 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -12,24 +12,24 @@ #include <assert.h> #include <emmintrin.h> // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" -#include "./av1_rtcd.h" #include "av1/common/filter.h" - -typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" unsigned int aom_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = xx_loadu_128(src); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); src += 8; } @@ -39,276 +39,265 @@ unsigned int 
aom_get_mb_ss_sse2(const int16_t *src) { return _mm_cvtsi128_si32(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8( \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); + return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); +} -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); +static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); + return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); +} - // sse - vsum = - _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); +// Accumulate 4 32bit numbers in val to 1 32bit number +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return _mm_cvtsi128_si32(val); } -void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Accumulate 8 16bit in sum to 4 32bit number +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } +static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, + __m128i *const sse, + __m128i *const 
sum) { + const __m128i diff = _mm_sub_epi16(src, ref); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +// diff sum of 128 pixels can still fit in 16bit integer +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); } -void aom_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - src += src_stride; - ref += ref_stride; - } + vsum = sum_to_32bit_sse2(vsum); + *sum = add32x4_sse2(vsum); +} - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = - (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. 
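[Editorial aside] The variance_final_*_pel_sse2 family just defined differs only in how far the eight int16 lane sums in vsum can be folded before 16-bit overflow: each _mm_srli_si128 + _mm_add_epi16 fold doubles the worst case, and each lane accumulates (pixels / 8) differences with |diff| <= 255. Worked out from the code above (the typedef is only an illustrative compile-time restatement of the 256-pixel case, not part of the patch):

// Per-lane worst case is 255 * (pixels / 8):
//   128 px: 4080  -> three 16-bit folds end at 32640 < 32767; extract 1 lane
//   256 px: 8160  -> two folds reach 32640; lanes 0 and 1 are added in C
//   512 px: 16320 -> one fold, then widen to 32 bits before finishing
//  1024 px: 32640 -> no 16-bit fold is safe; widen first (sum_to_32bit_sse2)
typedef char check_256px_folds[(2 * 2 * (255 * 256 / 8) <= 32767) ? 1 : -1];

The same budget explains the per-width height asserts in the variance4/8/16/32/64/128 loops: each caps the block at 1024 pixels, i.e. 128 differences per 16-bit lane.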
+ *sum = _mm_setzero_si128(); - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, int w, - int h, unsigned int *sse, int *sum, - getNxMvar_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; } } -unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 4 * 4); - assert(sum >= -255 * 4 * 4); - return *sse - ((sum * sum) >> 4); -} - -unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 8 * 4); - assert(sum >= -255 * 8 * 4); - return *sse - ((sum * sum) >> 5); +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. + *sum = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 8 * 4); - assert(sum >= -255 * 8 * 4); - return *sse - ((sum * sum) >> 5); +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); } -unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 8 * 8); - assert(sum >= -255 * 8 * 8); - return *sse - ((sum * sum) >> 6); -} +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. 
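[Editorial aside] The SSE2 loads above (load4x2_sse2, load8_8to16_sse2, and the unpacklo/unpackhi pairs inside variance16_kernel_sse2) all do the same thing: interleave pixel bytes with a zero register to zero-extend uint8 lanes to 16 bits, after which the subtraction is exact in [-255, 255]. A scalar model of one 8-lane kernel step (illustrative names, not part of the patch):

#include <stdint.h>

// load8_8to16_sse2 in scalar form: _mm_unpacklo_epi8(v, zero) is a
// uint8 -> uint16 zero-extension of the low eight bytes.
static void load8_8to16_scalar(const uint8_t *p, int16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = p[i];
}

// variance_kernel_sse2 in scalar form. _mm_madd_epi16(diff, diff) squares
// the lanes and pre-adds adjacent pairs into 32-bit slots; the total is the
// same either way.
static void variance_kernel_scalar(const int16_t s[8], const int16_t r[8],
                                   int32_t *sse, int32_t *sum) {
  for (int i = 0; i < 8; ++i) {
    const int diff = s[i] - r[i];  // exact: both inputs are in [0, 255]
    *sse += diff * diff;
    *sum += diff;                  // subject to the 16-bit budget noted above
  }
}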
+ *sum = _mm_setzero_si128(); -unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 16 * 8); - assert(sum >= -255 * 16 * 8); - return *sse - ((sum * sum) >> 7); + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 16 * 8); - assert(sum >= -255 * 16 * 8); - return *sse - ((sum * sum) >> 7); +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 16 * 16); - assert(sum >= -255 * 16 * 16); - return *sse - ((uint32_t)((int64_t)sum * sum) >> 8); +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 32); - assert(sum >= -255 * 32 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. 
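[Editorial aside] variance32_sse2, variance64_sse2 and variance128_sse2 are the same loop applied at 2, 4 and 8 successive 16-byte offsets; only the height cap changes so the 1024-pixel budget holds. A hypothetical generic form, shown only to make the pattern explicit (it leans on the variance16_kernel_sse2 helper from this patch and is not itself part of the change; the real code unrolls the inner loop per width):

#include <emmintrin.h>
#include <stdint.h>

// w must be a multiple of 16 and w * h must stay within the 1024-pixel
// budget. sse is deliberately not zeroed here: as in the real kernels, it
// accumulates across calls.
static void varianceW_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int w, int h,
                           __m128i *const sse, __m128i *const sum) {
  *sum = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16)
      variance16_kernel_sse2(src + j, ref + j, sse, sum);
    src += src_stride;
    ref += ref_stride;
  }
}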
+ *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 16); - assert(sum >= -255 * 32 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); -} +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } -unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 16); - assert(sum >= -255 * 32 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); -} +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); + +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); +AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } -unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 64); - assert(sum >= -255 * 64 * 64); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); -} +AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) -unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - 
variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 32); - assert(sum >= -255 * 64 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); +AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) -unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 32); - assert(sum >= -255 * 64 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} +AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -338,74 +327,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, return *sse; } -#if CONFIG_EXT_PARTITION_TYPES -unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 4 * 16); - assert(sum >= -255 * 4 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); -} - -unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 16 * 4); - assert(sum >= -255 * 16 * 4); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); -} - -unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 8 * 32); - assert(sum >= -255 * 8 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); -} - -unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 32 * 8); - assert(sum >= -255 * 32 * 8); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); -} - -unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 16 * 64); - assert(sum >= -255 * 16 * 64); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} - -unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 16); - assert(sum >= -255 * 64 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} -#endif - // The 2 unused parameters are place holders for PIC enabled build. 
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt) \
@@ -423,75 +344,57 @@ DECLS(ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
-    unsigned int sse; \
-    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
-                                                  y_offset, dst, dst_stride, \
-                                                  h, &sse, NULL, NULL); \
-    if (w > wf) { \
-      unsigned int sse2; \
-      int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
-          &sse2, NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-      if (w > wf * 2) { \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt( \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL); \
-        se += se2; \
-        sse += sse2; \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt( \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL); \
-        se += se2; \
-        sse += sse2; \
-      } \
-    } \
-    *sse_ptr = sse; \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+  unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+    /*Avoid overflow in helper by capping height.*/ \
+    const int hf = AOMMIN(h, 64); \
+    unsigned int sse = 0; \
+    int se = 0; \
+    for (int i = 0; i < (w / wf); ++i) { \
+      const uint8_t *src_ptr = src; \
+      const uint8_t *dst_ptr = dst; \
+      for (int j = 0; j < (h / hf); ++j) { \
+        unsigned int sse2; \
+        const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2, NULL, NULL); \
+        dst_ptr += hf * dst_stride; \
+        src_ptr += hf * src_stride; \
+        se += se2; \
+        sse += sse2; \
+      } \
+      src += wf; \
+      dst += wf; \
+    } \
+    *sse_ptr = sse; \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt) \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt) \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
+#define FNS(opt) \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -516,76 +419,61 @@ DECLS(ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
-      const uint8_t *sec) { \
-    unsigned int sse; \
-    int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL); \
-    if (w > wf) { \
-      unsigned int sse2; \
-      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
-          sec + 16, w, h, &sse2, NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-      if (w > wf * 2) { \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
-            sec + 32, w, h, &sse2, NULL, NULL); \
-        se += se2; \
-        sse += sse2; \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
-            sec + 48, w, h, &sse2, NULL, NULL); \
-        se += se2; \
-        sse += sse2; \
-      } \
-    } \
-    *sseptr = sse; \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+      const uint8_t *sec) { \
+    /*Avoid overflow in helper by capping height.*/ \
+    const int hf = AOMMIN(h, 64); \
+    unsigned int sse = 0; \
+    int se = 0; \
+    for (int i = 0; i < (w / wf); ++i) { \
+      const uint8_t *src_ptr = src; \
+      const uint8_t *dst_ptr = dst; \
+      const uint8_t *sec_ptr = sec; \
+      for (int j = 0; j < (h / hf); ++j) { \
+        unsigned int sse2; \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+            sec_ptr, w, hf, &sse2, NULL, NULL); \
+        dst_ptr += hf * dst_stride; \
+        src_ptr += hf * src_stride; \
+        sec_ptr += hf * w; \
+        se += se2; \
+        sse += sse2; \
+      } \
+      src += wf; \
+      dst += wf; \
+      sec += wf; \
+    } \
+    *sse_ptr = sse; \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt) \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt) \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
+#define FNS(opt) \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -593,9 +481,97 @@ FNS(ssse3);
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
                              const uint8_t *ref, int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
       int i;
@@ -604,8 +580,7 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       for (i = 0; i < height; i++) {
         int j;
         for (j = 0; j < width; j += 16) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
           comp_pred += 16;
           ref += 16;
         }
@@ -617,10 +592,9 @@
       assert(!(height & 1));
       /*Read 8 pixels two rows at a time.*/
       for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
         comp_pred += 16;
         ref += 2 * ref_stride;
       }
@@ -630,69 +604,62 @@
       assert(!(height & 3));
      /*Read 4 pixels four rows at a time.*/
      for (i = 0; i < height; i += 4) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride));
-        __m128i s2 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride));
-        __m128i s3 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride));
-        __m128i t0 = _mm_unpacklo_epi32(s0, s1);
-        __m128i t1 = _mm_unpacklo_epi32(s2, s3);
-        __m128i u0 = _mm_unpacklo_epi64(t0, t1);
-        _mm_storeu_si128((__m128i *)comp_pred, u0);
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
        comp_pred += 16;
        ref += 4 * ref_stride;
      }
    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-      aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
-                                      int width, int height, int subpel_x_q3,
-                                      int subpel_y_q3, const uint8_t *ref,
-                                      int ref_stride) {
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride) {
   int n;
   int i;
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
   for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
     comp_pred += 16;
     pred += 16;
   }
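A note on the rewritten FN macro at the top of this diff: the old version unrolled at most three extra column calls (src + 16, + 32, + 48), while the new one walks (w / wf) columns of (h / hf) tiles, so the same shape covers the 128-pixel block sizes added in AV1. The sketch below restates that decomposition as a plain function. The tile_fn typedef and the variance_by_tiles name are hypothetical stand-ins for the asm-backed aom_sub_pixel_variance##wf##xh_##opt helpers; only the loop structure and the closing variance formula are taken from the macro.

#include <stdint.h>

/* Hypothetical stand-in for the asm-backed helpers: returns the sum of
 * differences (se) over one wf x hf tile and writes the sum of squared
 * differences through *sse. */
typedef int (*tile_fn)(const uint8_t *src, int src_stride, const uint8_t *dst,
                       int dst_stride, int hf, unsigned int *sse);

/* Sketch of the tile walk: a w x h block is covered by (w / wf) columns of
 * (h / hf) tiles.  se and sse are plain sums over pixels, so accumulating
 * per-tile results is identical to measuring the whole block at once;
 * capping hf at 64 merely keeps each helper call's internal accumulators
 * in range for 128-tall blocks.  log2_wh is wlog2 + hlog2. */
static unsigned int variance_by_tiles(tile_fn f, const uint8_t *src,
                                      int src_stride, const uint8_t *dst,
                                      int dst_stride, int w, int h, int wf,
                                      int log2_wh) {
  const int hf = h < 64 ? h : 64; /* AOMMIN(h, 64) */
  unsigned int sse = 0;
  int se = 0;
  for (int i = 0; i < w / wf; ++i) {
    const uint8_t *src_ptr = src + i * wf;
    const uint8_t *dst_ptr = dst + i * wf;
    for (int j = 0; j < h / hf; ++j) {
      unsigned int sse_tile;
      se += f(src_ptr, src_stride, dst_ptr, dst_stride, hf, &sse_tile);
      sse += sse_tile;
      src_ptr += hf * src_stride;
      dst_ptr += hf * dst_stride;
    }
  }
  /* variance = SSE - (sum of differences)^2 / (w * h) */
  return sse - (unsigned int)(((int64_t)se * se) >> log2_wh);
}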
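For blocks narrower than 16 pixels, aom_upsampled_pred_sse2 still writes comp_pred in whole 16-byte stores by packing several rows into one register; the xx_loadl_64/xx_loadu_128/xx_storeu_128 calls are thin wrappers from aom_dsp/x86/synonyms.h around the corresponding SSE2 intrinsics. Below is a self-contained sketch of the two gather patterns using the raw intrinsics. The copy_8wide and copy_4wide names are illustrative, and the four-row variant steps its counter by 4 to match the 4 * ref_stride advance.

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

/* 8-wide block: interleave two 8-byte rows into one 16-byte store. */
static void copy_8wide(uint8_t *comp_pred, const uint8_t *ref, int ref_stride,
                       int height) {
  for (int i = 0; i < height; i += 2) {
    const __m128i r0 =
        _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
    const __m128i r1 =
        _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
    _mm_storeu_si128((__m128i *)comp_pred, _mm_unpacklo_epi64(r0, r1));
    comp_pred += 16;
    ref += 2 * ref_stride;
  }
}

/* 4-wide block: gather four 4-byte rows, pairing them with a 32-bit and
 * then a 64-bit unpack, so one store covers four rows. */
static void copy_4wide(uint8_t *comp_pred, const uint8_t *ref, int ref_stride,
                       int height) {
  for (int i = 0; i < height; i += 4) {
    const __m128i r0 = _mm_cvtsi32_si128(*(const int *)(ref + 0 * ref_stride));
    const __m128i r1 = _mm_cvtsi32_si128(*(const int *)(ref + 1 * ref_stride));
    const __m128i r2 = _mm_cvtsi32_si128(*(const int *)(ref + 2 * ref_stride));
    const __m128i r3 = _mm_cvtsi32_si128(*(const int *)(ref + 3 * ref_stride));
    const __m128i lo = _mm_unpacklo_epi32(r0, r1);
    const __m128i hi = _mm_unpacklo_epi32(r2, r3);
    _mm_storeu_si128((__m128i *)comp_pred, _mm_unpacklo_epi64(lo, hi));
    comp_pred += 16;
    ref += 4 * ref_stride;
  }
}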
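When both subpel offsets are non-zero, the function runs a separable two-pass 8-tap filter: a horizontal pass into a MAX_SB_SIZE-stride scratch buffer, then a vertical pass into comp_pred. The horizontal pass starts (taps / 2 - 1) rows above the block and must produce enough extra scratch rows for the vertical taps; the intermediate_height expression encodes exactly that. A small worked check of the arithmetic (intermediate_rows is an illustrative name, not part of the source):

#include <assert.h>

/* intermediate_height from the two-pass branch: the vertical 8-tap pass
 * reads `taps` consecutive scratch rows per output row, and a non-zero
 * subpel_y_q3 can push the last read one row further down. */
static int intermediate_rows(int height, int subpel_y_q3, int taps) {
  return (((height - 1) * 8 + subpel_y_q3) >> 3) + taps;
}

int main(void) {
  /* e.g. a 32-tall block at half-pel (subpel_y_q3 == 4) needs
     ((31 * 8 + 4) >> 3) + 8 == 31 + 8 == 39 scratch rows. */
  assert(intermediate_rows(32, 4, 8) == 39);
  /* The scratch buffer is sized for the worst case guarded by the
     assert in the source: (MAX_SB_SIZE * 2 + 16) + 16 rows. */
  return 0;
}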
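Finally, aom_comp_avg_upsampled_pred_sse2 blends the upsampled prediction with a second predictor 16 pixels at a time. _mm_avg_epu8 computes the rounded average (a + b + 1) >> 1 per byte, which matches the ROUND_POWER_OF_TWO(tmp, 1) rounding of the C reference aom_comp_avg_pred. A minimal, self-contained version of the blend loop (avg_pred_16 is an illustrative name):

#include <assert.h>
#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

/* Average `pred` into `comp_pred` in place, 16 pixels per iteration.
 * As in the source, the pixel count must be a multiple of 16. */
static void avg_pred_16(uint8_t *comp_pred, const uint8_t *pred,
                        int n_pixels) {
  assert(!(n_pixels & 15));
  for (int i = 0; i < n_pixels >> 4; ++i) {
    const __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
    const __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
    /* per-byte (s0 + p0 + 1) >> 1 */
    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
    comp_pred += 16;
    pred += 16;
  }
}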