From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 04:16:08 -0500 Subject: Add m-esr52 at 52.6.0 --- media/libvpx/vp9/common/x86/convolve.h | 296 ++ media/libvpx/vp9/common/x86/vp9_asm_stubs.c | 162 + media/libvpx/vp9/common/x86/vp9_copy_sse2.asm | 156 + .../vp9/common/x86/vp9_high_intrapred_sse2.asm | 476 +++ .../common/x86/vp9_high_loopfilter_intrin_sse2.c | 1215 ++++++ .../vp9/common/x86/vp9_high_subpixel_8t_sse2.asm | 962 +++++ .../common/x86/vp9_high_subpixel_bilinear_sse2.asm | 494 +++ media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 4223 ++++++++++++++++++++ media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h | 174 + .../vp9/common/x86/vp9_idct_ssse3_x86_64.asm | 300 ++ media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm | 667 ++++ .../libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm | 1036 +++++ .../vp9/common/x86/vp9_loopfilter_intrin_avx2.c | 986 +++++ .../vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 1580 ++++++++ media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm | 611 +++ media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 287 ++ media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm | 694 ++++ .../vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c | 602 +++ .../vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 601 +++ .../libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm | 987 +++++ .../vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 1071 +++++ .../vp9/common/x86/vp9_subpixel_bilinear_sse2.asm | 448 +++ .../vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm | 422 ++ 23 files changed, 18450 insertions(+) create mode 100644 media/libvpx/vp9/common/x86/convolve.h create mode 100644 media/libvpx/vp9/common/x86/vp9_asm_stubs.c create mode 100644 media/libvpx/vp9/common/x86/vp9_copy_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm 
create mode 100644 media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm (limited to 'media/libvpx/vp9/common/x86') diff --git a/media/libvpx/vp9/common/x86/convolve.h b/media/libvpx/vp9/common/x86/convolve.h new file mode 100644 index 000000000..de2df47e5 --- /dev/null +++ b/media/libvpx/vp9/common/x86/convolve.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef VP9_COMMON_X86_CONVOLVE_H_ +#define VP9_COMMON_X86_CONVOLVE_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +typedef void filter8_1dfunction ( + const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter +); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, 
h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, \ + ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 
+ if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + 
vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 7, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 1, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } else { \ + vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VP9_COMMON_X86_CONVOLVE_H_ diff --git a/media/libvpx/vp9/common/x86/vp9_asm_stubs.c b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c new file mode 100644 index 000000000..fd55fb8c6 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction vp9_filter_block1d16_v8_sse2; +filter8_1dfunction vp9_filter_block1d16_h8_sse2; +filter8_1dfunction vp9_filter_block1d8_v8_sse2; +filter8_1dfunction vp9_filter_block1d8_h8_sse2; +filter8_1dfunction vp9_filter_block1d4_v8_sse2; +filter8_1dfunction vp9_filter_block1d4_h8_sse2; +filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; + +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; + +// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int 
y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); + +// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_ , sse2); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2; + +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2; 
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; + +// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + sse2); + +// void 
vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_ , sse2); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +#endif // HAVE_SSE2 diff --git a/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm new file mode 100644 index 000000000..b26383708 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -0,0 +1,156 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1 +INIT_XMM sse2 +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ + fx, fxs, fy, fys, w, h + mov r4d, dword wm + cmp r4d, 4 + je .w4 + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+16] + pavgb m2, [dstq+32] + pavgb m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq +16] + pavgb m2, [dstq+dst_strideq] + pavgb m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +INIT_MMX sse +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] 
+ pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavgb m0, m4 + pavgb m1, m5 + pavgb m2, m6 + pavgb m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endmacro + +convolve_fn copy +convolve_fn avg diff --git a/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm new file mode 100644 index 000000000..b12d29c0a --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm @@ -0,0 +1,476 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_MMX sse +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, one + mov oned, 0x0001 + pxor m1, m1 + movd m3, oned + pshufw m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshufw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, 
m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + mova m5, [leftq] + mova m6, [leftq+16] + mova m7, [leftq+32] + mova m8, [leftq+48] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + paddw m0, m6 + paddw m0, m7 + paddw m0, m8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET +%endif + +INIT_MMX sse +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, 
[strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + movq m0, [aboveq] + pshufw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + movd m3, oned + movd m4, bpsd + pshufw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + mova m2, m3 + psllw m3, m4 + add leftq, 8 + psubw m3, m2 ; max possible value + pxor 
m4, m4 ; min possible value + psubw m0, m1 +.loop: + movq m1, [leftq+lineq*4] + movq m2, [leftq+lineq*4+2] + pshufw m1, m1, 0x0 + pshufw m2, m2, 0x0 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m1 + movq [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bpsd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m7, m7 + pxor m8, m8 + pinsrw m7, oned, 0 + pinsrw m8, bpsd, 0 + pshuflw m7, m7, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m7, m7 + mov lineq, -8 + mova m5, m7 + punpcklqdq m2, m2 + psllw m7, m8 + add leftq, 32 + psubw m7, m5 ; max possible value + pxor m8, m8 ; min possible value + psubw m0, m2 + psubw m1, m2 +.loop: + movd m2, 
[leftq+lineq*4] + movd m3, [leftq+lineq*4+2] + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m4, m2, m0 + paddw m5, m3, m0 + paddw m2, m1 + paddw m3, m1 + ;Clamp to the bit-depth + pminsw m4, m7 + pminsw m5, m7 + pminsw m2, m7 + pminsw m3, m7 + pmaxsw m4, m8 + pmaxsw m5, m8 + pmaxsw m2, m8 + pmaxsw m3, m8 + ;Store the values + mova [dstq ], m4 + mova [dstq+strideq*2 ], m5 + mova [dstq +16], m2 + mova [dstq+strideq*2+16], m3 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m10, m10 + pxor m11, m11 + pinsrw m10, oned, 0 + pinsrw m11, bpsd, 0 + pshuflw m10, m10, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m10, m10 + mov lineq, -16 + mova m5, m10 + punpcklqdq m0, m0 + psllw m10, m11 + add leftq, 64 + psubw m10, m5 ; max possible value + pxor m11, m11 ; min possible value + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m5, [leftq+lineq*4] + movd m6, [leftq+lineq*4+2] + pshuflw m5, m5, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m5, m5 + punpcklqdq m6, m6 + paddw m7, m5, m1 + paddw m8, m5, m2 + paddw m9, m5, m3 + paddw m5, m4 + ;Clamp these values to the bit-depth + pminsw m7, m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m5, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m5, m11 + ;Store these values + mova [dstq ], m7 + mova [dstq +16], m8 + mova [dstq +32], m9 + mova [dstq +48], m5 + paddw m7, m6, m1 + paddw m8, m6, m2 + paddw m9, m6, m3 + paddw m6, m4 + ;Clamp these values to the bit-depth + pminsw m7, m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m6, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m6, m11 + ;Store these values + 
mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m8 + mova [dstq+strideq*2+32], m9 + mova [dstq+strideq*2+48], m6 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET +%endif diff --git a/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c new file mode 100644 index 000000000..b40669c63 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c @@ -0,0 +1,1215 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vpx_ports/emmintrin_compat.h" + +static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { + __m128i ubounded; + __m128i lbounded; + __m128i retval; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i t80, max, min; + + if (bd == 8) { + t80 = _mm_set1_epi16(0x80); + max = _mm_subs_epi16( + _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + } else if (bd == 10) { + t80 = _mm_set1_epi16(0x200); + max = _mm_subs_epi16( + _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + } else { // bd == 12 + t80 = _mm_set1_epi16(0x800); + max = _mm_subs_epi16( + _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + } + + min = _mm_subs_epi16(zero, t80); + + ubounded = _mm_cmpgt_epi16(value, max); + lbounded = _mm_cmplt_epi16(value, min); + retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); + ubounded = _mm_and_si128(ubounded, max); + lbounded = _mm_and_si128(lbounded, min); + retval = _mm_or_si128(retval, 
ubounded); + retval = _mm_or_si128(retval, lbounded); + return retval; +} + +// TODO(debargha, peter): Break up large functions into smaller ones +// in this file. +static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, + int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, + int bd) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i blimit, limit, thresh; + __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; + __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; + __m128i ps1, qs1, ps0, qs0; + __m128i abs_p0q0, abs_p1q1, ffff, work; + __m128i filt, work_a, filter1, filter2; + __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; + __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; + __m128i flat2_q0, flat2_p0; + __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3; + __m128i t4, t3, t80, t1; + __m128i eight, four; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + } 
+ + q4 = _mm_load_si128((__m128i *)(s + 4 * p)); + p4 = _mm_load_si128((__m128i *)(s - 5 * p)); + q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + p3 = _mm_load_si128((__m128i *)(s - 4 * p)); + q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + + // highbd_filter_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + + // highbd_hev_mask (in C code this is actually called from highbd_filter4) + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0), + _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), + _mm_subs_epu16(q0, q1))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), + _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), + _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), + _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), + _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + + mask = _mm_subs_epu16(mask, limit); + 
mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + + // lp filter + // highbd_filter4 + t4 = _mm_set1_epi16(4); + t3 = _mm_set1_epi16(3); + if (bd == 8) + t80 = _mm_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm_set1_epi16(0x200); + else // bd == 12 + t80 = _mm_set1_epi16(0x800); + + t1 = _mm_set1_epi16(0x1); + + ps1 = _mm_subs_epi16(p1, t80); + qs1 = _mm_subs_epi16(q1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + + filt = _mm_and_si128( + signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_and_si128(filt, mask); + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + filter1 = _mm_srai_epi16(filter1, 0x3); + filter2 = _mm_srai_epi16(filter2, 0x3); + + qs0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), + t80); + ps0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), + t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(hev, filt); + qs1 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + // end highbd_filter4 + // loopfilter done + + // highbd_flat_mask4 + flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), + _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), + _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0), + _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), + _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + flat = _mm_max_epi16(work, flat); + + if (bd 
== 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + // end flat_mask4 + + // flat & mask = flat && mask (as used in filter8) + // (because, in both vars, each block of 16 either all 1s or all 0s) + flat = _mm_and_si128(flat, mask); + + p5 = _mm_load_si128((__m128i *)(s - 6 * p)); + q5 = _mm_load_si128((__m128i *)(s + 5 * p)); + p6 = _mm_load_si128((__m128i *)(s - 7 * p)); + q6 = _mm_load_si128((__m128i *)(s + 6 * p)); + p7 = _mm_load_si128((__m128i *)(s - 8 * p)); + q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + + // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 + // but referred to as p0-p4 & q0-q4 in fn) + flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0), + _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), + _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0), + _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), + _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0), + _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), + _mm_subs_epu16(q0, q6))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0), + _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), + _mm_subs_epu16(q0, q7))); + flat2 = _mm_max_epi16(work, flat2); + + if (bd == 8) + flat2 = _mm_subs_epu16(flat2, one); + else if (bd == 10) + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + + flat2 = _mm_cmpeq_epi16(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // end highbd_flat_mask5 + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + eight = 
_mm_set1_epi16(8); + four = _mm_set1_epi16(4); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), + _mm_add_epi16(p4, p3)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), + _mm_add_epi16(q4, q3)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, + pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, + pixetFilter_q2q1q0)); + flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(p7, p0)), 4); + flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3, q0)), 3); + + sum_p7 = _mm_add_epi16(p7, p7); + sum_q7 = _mm_add_epi16(q7, q7); + sum_p3 = _mm_add_epi16(p3, p3); + sum_q3 = _mm_add_epi16(q3, q3); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); + flat2_p1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); + flat2_q1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); + flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + sum_p3 = _mm_add_epi16(sum_p3, p3); + sum_q3 = _mm_add_epi16(sum_q3, q3); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); + pixelFilter_q = 
_mm_sub_epi16(pixelFilter_q, p5); + flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q2)), 4); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); + flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); + flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q3)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); + flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q4)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); + flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q5)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); + flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q6)), 4); + + // wide 
flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // highbd_filter8 + p2 = _mm_andnot_si128(flat, p2); + // p2 remains unchanged if !(flat && mask) + flat_p2 = _mm_and_si128(flat, flat_p2); + // when (flat && mask) + p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values + + ps1 = _mm_andnot_si128(flat, ps1); + // p1 takes the value assigned to it in filter4 if !(flat && mask) + flat_p1 = _mm_and_si128(flat, flat_p1); + // when (flat && mask) + p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values + qs1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values + + ps0 = _mm_andnot_si128(flat, ps0); + // p0 takes the value assigned to it in filter4 if !(flat && mask) + flat_p0 = _mm_and_si128(flat, flat_p0); + // when (flat && mask) + p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values + qs0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values + // end highbd_filter8 + + // highbd_filter16 + p6 = _mm_andnot_si128(flat2, p6); + // p6 remains unchanged if !(flat2 && flat && mask) + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + // get values for when (flat2 && flat && mask) + p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values + q6 = _mm_andnot_si128(flat2, q6); + // q6 remains unchanged if !(flat2 && flat && mask) + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + // get values for when (flat2 && flat && mask) + q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values + _mm_store_si128((__m128i *)(s - 7 * p), p6); + _mm_store_si128((__m128i *)(s + 6 * p), q6); + + p5 = _mm_andnot_si128(flat2, p5); + // p5 remains unchanged if !(flat2 && flat && mask) + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + // get values for when (flat2 && flat && mask) + p5 
= _mm_or_si128(p5, flat2_p5); + // full list of p5 values + q5 = _mm_andnot_si128(flat2, q5); + // q5 remains unchanged if !(flat2 && flat && mask) + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + // get values for when (flat2 && flat && mask) + q5 = _mm_or_si128(q5, flat2_q5); + // full list of q5 values + _mm_store_si128((__m128i *)(s - 6 * p), p5); + _mm_store_si128((__m128i *)(s + 5 * p), q5); + + p4 = _mm_andnot_si128(flat2, p4); + // p4 remains unchanged if !(flat2 && flat && mask) + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + // get values for when (flat2 && flat && mask) + p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values + q4 = _mm_andnot_si128(flat2, q4); + // q4 remains unchanged if !(flat2 && flat && mask) + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + // get values for when (flat2 && flat && mask) + q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values + _mm_store_si128((__m128i *)(s - 5 * p), p4); + _mm_store_si128((__m128i *)(s + 4 * p), q4); + + p3 = _mm_andnot_si128(flat2, p3); + // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + // get values for when (flat2 && flat && mask) + p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values + q3 = _mm_andnot_si128(flat2, q3); + // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + // get values for when (flat2 && flat && mask) + q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values + _mm_store_si128((__m128i *)(s - 4 * p), p3); + _mm_store_si128((__m128i *)(s + 3 * p), q3); + + p2 = _mm_andnot_si128(flat2, p2); + // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + // get values for when (flat2 && flat && mask) + p2 = _mm_or_si128(p2, flat2_p2); + // full list of p2 values + q2 = _mm_andnot_si128(flat2, q2); + // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q2 = _mm_and_si128(flat2, 
flat2_q2); + // get values for when (flat2 && flat && mask) + q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values + _mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s + 2 * p), q2); + + p1 = _mm_andnot_si128(flat2, p1); + // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + // get values for when (flat2 && flat && mask) + p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values + q1 = _mm_andnot_si128(flat2, q1); + // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + // get values for when (flat2 && flat && mask) + q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + + p0 = _mm_andnot_si128(flat2, p0); + // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + // get values for when (flat2 && flat && mask) + p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values + q0 = _mm_andnot_si128(flat2, q0); + // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + // get values for when (flat2 && flat && mask) + q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s - 0 * p), q0); +} + +static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s, + int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, + int bd) { + highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); + highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh, + bd); +} + +// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 
+void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, + int count, int bd) { + if (count == 1) + highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); + else + highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd); +} + +void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, + int count, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft; + + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + (void)count; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = 
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_set1_epi16(0x200); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = _mm_set1_epi16(0x800); + } + + ps1 = _mm_subs_epi16(p1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + qs1 = _mm_subs_epi16(q1, t80); + + // filter_mask and hev_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), + _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), + _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), + _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), + _mm_subs_epu16(q1, p1)); + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = 
_mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), + _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), + _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), + _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), + _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), + _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), + _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0), + _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), + _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op2[0], workp_shft); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op0[0], workp_shft); + + workp_a = 
_mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + + // lp filter + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = signed_char_clamp_bd_sse2(filt, bd); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi16(filt, t4); + filter2 = _mm_adds_epi16(filt, t3); + + // Filter1 >> 3 + filter1 = signed_char_clamp_bd_sse2(filter1, bd); + filter1 = _mm_srai_epi16(filter1, 3); + + // Filter2 >> 3 + filter2 = signed_char_clamp_bd_sse2(filter2, bd); + filter2 = _mm_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm_andnot_si128(hev, filt); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_adds_epi16(work_a, t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + q1 = 
_mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(work_a, t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s + 0 * p), q0); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s + 2 * p), q2); +} + +void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1, + int bd) { + vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); + vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, + 1, bd); +} + +void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, + int count, int bd) { + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, 
limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), + _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), + _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), + _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), + _mm_subs_epu16(q1, p1)); + __m128i work; + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + __m128i tff80; + __m128i tffe0; + __m128i t1f; + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i t7f; + // equivalent to shifting 0x7f left by bitdepth - 8 + // and setting new bits to 1 + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + (void)count; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + tff80 = _mm_set1_epi16(0xff80); + tffe0 = _mm_set1_epi16(0xffe0); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i 
*)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); + } + + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 
- q0) > limit) * -1; + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), + _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), + _mm_subs_epu16(p2, p3))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1), + _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), + _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // filter4 + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm_and_si128(filter1, t1f); // clamp the range + filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, tffe0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + work_a = _mm_cmpgt_epi16(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, tff80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + q1 = _mm_adds_epi16( + 
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80); + p0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + p1 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1, + int bd) { + vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); + vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1, + bd); +} + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, + uint16_t *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 + p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 + p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 + p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 + p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, 
p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0*out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1*out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2*out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3*out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4*out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5*out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6*out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7*out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + 
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, + int in_p, uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} + +void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + (void)count; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop filtering + vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, + bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + (void)count; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop 
filtering + vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, + bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); + uint16_t *src[2]; + uint16_t *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + highbd_transpose(src, p, dst, 8, 2); + + // Loop filtering + highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, + thresh, bd); + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + highbd_transpose(src, 8, dst, p, 2); +} + +void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, + int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); + + // Transpose back + 
highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..29ec151ed --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm @@ -0,0 +1,962 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row 
in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + 
movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +;void vp9_highbd_filter_block1d4_v8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pitch, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int out_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + 
movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_highbd_filter_block1d8_v8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pitch, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int out_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_highbd_filter_block1d16_v8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pitch, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int out_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] 
+ %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global 
sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 1, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void 
vp9_highbd_filter_block1d4_h8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pixels_per_line, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int output_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_highbd_filter_block1d8_h8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pixels_per_line, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int output_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + 
%define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_highbd_filter_block1d16_h8_sse2 +;( +; unsigned short *src_ptr, ;16-bit samples +; unsigned int src_pixels_per_line, ;in samples; doubled to bytes below +; unsigned short *output_ptr, ;16-bit samples +; unsigned int output_pitch, ;in samples; doubled to bytes below +; unsigned int output_height, +; short *filter, ;8 taps; must be 16-byte aligned (movdqa) +; int bps ;bit depth; output is clamped to [0, (1 << bps) - 1] +;) +global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + 
movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 
[rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 1, 16 + + lea 
rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..93784121c --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,494 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq 
[rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, 
[rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE 
+sym(vp9_highbd_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + 
HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_highbd_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c new file mode 100644 index 000000000..ce010df3b --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -0,0 +1,4223 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" +#include "vp9/common/vp9_idct.h" + +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ +} + +void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16( + (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = _mm_load_si128((const __m128i *)input); + input2 = _mm_load_si128((const __m128i *)(input + 8)); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + + input1 = _mm_unpackhi_epi32(input0, input0); + input0 = _mm_unpacklo_epi32(input0, input0); + input3 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpacklo_epi32(input2, input2); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = 
_mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + } +} + +void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest + 0 * stride, dc_value); + RECON_AND_STORE4X4(dest + 1 * stride, dc_value); + RECON_AND_STORE4X4(dest + 2 * stride, dc_value); + RECON_AND_STORE4X4(dest + 3 * stride, dc_value); +} + +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +static 
void idct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +static void iadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_srli_si128(in[1], 8); + in7 = 
_mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[2]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = _mm_loadu_si128((const __m128i *)(input)); + in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); + + switch (tx_type) { + case 0: // DCT_DCT + idct4_sse2(in); + idct4_sse2(in); + break; + case 1: // ADST_DCT + idct4_sse2(in); + iadst4_sse2(in); + break; + case 2: // DCT_ADST + iadst4_sse2(in); + idct4_sse2(in); + break; + case 3: // ADST_ADST + iadst4_sse2(in); + iadst4_sse2(in); + break; + default: + assert(0); + break; + } + + // Final round and shift + in[0] = 
_mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = 
_mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = 
_mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 
= _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } + +void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. 
+ in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = 
dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); +} + +static void idct8_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); +} + +static void iadst8_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i 
k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 
= _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, 
DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, 
k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = 
_mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + +void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + // load input data + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + + switch (tx_type) { + case 0: // DCT_DCT + idct8_sse2(in); + idct8_sse2(in); + break; + case 1: // ADST_DCT + idct8_sse2(in); + iadst8_sse2(in); + break; + case 2: // DCT_ADST + iadst8_sse2(in); + idct8_sse2(in); + break; + case 3: // ADST_ADST + iadst8_sse2(in); + iadst8_sse2(in); + break; + default: + assert(0); + break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 
5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. 
+ in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); + + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage3 + { + const __m128i lo_56 = 
_mm_unpacklo_epi16(stp2_5, stp2_6); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = 
_mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i 
hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = 
_mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + 
stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, 
stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[16], l[16], r[16], *curr1; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, 
stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct + + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); + + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = 
_mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + int j; + // 1-D idct + array_transpose_8x8(l + i * 8, in); + array_transpose_8x8(r + i * 8, in + 8); + + IDCT16 + + // 2-D + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); + 
RECON_AND_STORE(dest + 8 * stride, dc_value); + RECON_AND_STORE(dest + 9 * stride, dc_value); + RECON_AND_STORE(dest + 10 * stride, dc_value); + RECON_AND_STORE(dest + 11 * stride, dc_value); + RECON_AND_STORE(dest + 12 * stride, dc_value); + RECON_AND_STORE(dest + 13 * stride, dc_value); + RECON_AND_STORE(dest + 14 * stride, dc_value); + RECON_AND_STORE(dest + 15 * stride, dc_value); + dest += 8; + } +} + +static void iadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const 
__m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = 
_mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + 
u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = 
_mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = 
_mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = 
_mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], 
v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = 
_mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], 
k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +static void idct16_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = 
pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = 
_mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = 
_mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] 
= _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = 
_mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + 
t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = 
_mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +static void idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + +static void iadst16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + 
iadst16_8col(in0); + iadst16_8col(in1); +} + +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case 0: // DCT_DCT + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + case 1: // ADST_DCT + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + case 2: // DCT_ADST + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + case 3: // ADST_ADST + iadst16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + default: + assert(0); + break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, 
stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = 
_mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, 
stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. 
+ l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + array_transpose_4X8(l + 8 * i, in); + + IDCT16_10 + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +/* LOAD_DQCOEFF(reg, input): loads one 128-bit vector from input into reg with _mm_load_si128 (an aligned load), then advances input by 8 elements. input is expanded twice and must be a modifiable pointer lvalue; both arguments are parenthesized so a compound expression cannot rebind the cast or the assignment. */ +#define LOAD_DQCOEFF(reg, input) \ + { \ + (reg) = _mm_load_si128((const __m128i *)(input)); \ + (input) += 8; \ + } + +#define IDCT32_34 \ +/* Stage1 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i
hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, 
stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + /* lo_22_25 and hi_22_25 are the low and high halves of the same operand pair (stp1_23, stp1_24). Stage2 copies stp1_24 into stp2_24, so the value is the same either way; using stp1_24 in both keeps the pair from diverging if that copy ever changes. */ \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ +
stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + 
stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = 
_mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, 
stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = 
_mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = 
_mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + 
stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = 
_mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, 
stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = 
/*
 * 32x32 inverse DCT plus reconstruction for the case where only the
 * upper-left 8x8 of the coefficient block is non-zero.
 *
 * input  - coefficient block with a row pitch of 32 int16_t values. Only the
 *          first 8 values of the first 8 rows are read, using aligned 128-bit
 *          loads, so the pointer must be 16-byte aligned.
 * dest   - destination pixels; processed as four 8-pixel-wide strips of 32
 *          rows each.
 * stride - distance, in bytes, between consecutive rows of dest.
 *
 * The rounding constant, the stg* cosine pairs, the stp1_x/stp2_x stage
 * registers and tmp0..tmp7 are never referenced by name in the statements
 * below; presumably they are consumed by the IDCT32_34 macro (verify against
 * the macro definition). Do not rename or remove them without checking it.
 */
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Bias added before the final arithmetic shift right by 6.
  const __m128i final_rounding = _mm_set1_epi16(1<<5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // in[]  : working rows handed to the transform macro.
  // col[] : the 32 vectors produced by the first 1-D pass.
  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  // Consecutive rows are 32 coefficients apart.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 32));
  in[2] = _mm_load_si128((const __m128i *)(input + 64));
  in[3] = _mm_load_si128((const __m128i *)(input + 96));
  in[4] = _mm_load_si128((const __m128i *)(input + 128));
  in[5] = _mm_load_si128((const __m128i *)(input + 160));
  in[6] = _mm_load_si128((const __m128i *)(input + 192));
  in[7] = _mm_load_si128((const __m128i *)(input + 224));

  // Everything outside the loaded 8x8 corner is zero.
  for (i = 8; i < 32; ++i) {
    in[i] = _mm_setzero_si128();
  }

  array_transpose_8x8(in, in);
  // TODO(hkuang): The following transposes are unnecessary (they operate on
  // all-zero vectors), but removing them leads to a performance drop on some
  // devices.
  array_transpose_8x8(in + 8, in + 8);
  array_transpose_8x8(in + 16, in + 16);
  array_transpose_8x8(in + 24, in + 24);

  // First 1-D pass. The statements below read the results from
  // stp1_0..stp1_31.
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
  // Final butterfly: output k pairs stp1_k with stp1_(31 - k).
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  // Second 1-D pass: one iteration per 8-pixel-wide strip of the output.
  for (i = 0; i < 4; i++) {
    int j;
    // Not referenced by name below; presumably used by RECON_AND_STORE --
    // confirm against the macro before touching it.
    const __m128i zero = _mm_setzero_si128();
    // Transpose 32x8 block to 8x32 block
    // NOTE(review): only in[0..7] is refreshed here; from the second
    // iteration on, in[8..31] still hold the previous strip's rounded
    // results. This is only correct if IDCT32_34 reads in[0..7] exclusively
    // -- confirm against the macro.
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      // Saturating add of 32, then arithmetic shift right by 6.
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    // Move on to the next 8-pixel-wide strip.
    dest += 8;
  }
}
stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[128], zero_idx[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + for (i = 0; i < 
4; i++) { + i32 = (i << 5); + // First 1-D idct + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + 
zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = 
_mm_setzero_si128(); + continue; + } + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } + for (i = 0; i < 4; i++) { + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 
block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + IDCT32 + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = 
_mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 4; ++i) { + int j; + for (j = 0; j < 32; ++j) { + RECON_AND_STORE(dest + j * stride, dc_value); + } + dest += 8; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = 
_mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct4(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 
+ _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vp9_highbd_idct4(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} + +void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the 
min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_8x8(inptr, inptr); + for (i = 0; i < 8; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 8; ++i) { + vp9_highbd_idct8(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 
}; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only first 4 row has non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. 
Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct8(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vp9_highbd_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, 
uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_16x16(inptr, inptr + 16); + for (i = 0; i < 16; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], 
zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 16; ++i) { + vp9_highbd_idct16(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vp9_highbd_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t 
*dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // Since all non-zero dct coefficients are in upper-left 4x4 area, + // we only need to consider first 4 rows here. + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform (N.B. This transposes inptr) + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + // N.B. 
Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 16; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_8x8(inptr, inptr); + array_transpose_8x8(inptr + 8, inptr + 16); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vp9_highbd_idct16(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i 
]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vp9_highbd_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h new file mode 100644 index 000000000..984363d40 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i
tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 
16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 
* stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 10 * stride, in[10]); + RECON_AND_STORE(dest + 11 * stride, in[11]); + RECON_AND_STORE(dest + 12 * stride, in[12]); + RECON_AND_STORE(dest + 13 * stride, in[13]); + RECON_AND_STORE(dest + 14 * stride, in[14]); + RECON_AND_STORE(dest + 15 * stride, in[15]); +} diff --git a/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm new file mode 100644 index 000000000..2c1060710 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. 
+ +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + 
SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova m7, [inputq + 112] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + + punpcklwd m0, m1 + punpcklwd m2, m3 +
punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%endif diff --git a/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm new file mode 100644 index 000000000..22b573188 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm @@ -0,0 +1,667 @@ +; +; 
Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +INIT_MMX sse +cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + punpckldq m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] 
+ DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_MMX sse +cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + 
+ DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + pxor m2, m2 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + pxor m2, m2 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq 
m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + 
mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_MMX sse +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, 
[dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movd m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshufw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + add leftq, 4 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshufw m2, m2, 0x0 + pshufw m3, m3, 0x0 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m2 + packuswb m3, m3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 + add leftq, 8 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + 
pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m3 + movq [dstq ], m2 + movhps [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + punpcklbw m2, m1 + punpckhbw m4, m0, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + punpcklqdq m2, m2 + add leftq, 16 + psubw m0, m2 + psubw m4, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m5, m2, m0 + paddw m6, m3, m0 + paddw m2, m4 + paddw m3, m4 + packuswb m5, m2 + packuswb m6, m3 + mova [dstq ], m5 + mova [dstq+strideq], m6 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m6, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m6, m1 + pshuflw m2, m2, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m2, m2 + punpcklqdq m6, m6 + paddw m7, m2, m0 + paddw m8, m2, m3 + paddw m9, m2, m4 + paddw m2, m5 + packuswb m7, m8 + packuswb m9, m2 + paddw m2, m6, m0 + paddw m8, m6, m3 + mova [dstq ], m7 + paddw m7, m6, m4 + paddw m6, m5 + mova [dstq +16], m9 + packuswb m2, m8 + packuswb m7, m6 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m7 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET +%endif diff --git 
a/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm new file mode 100644 index 000000000..88df9b2d1 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -0,0 +1,1036 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + +SECTION .text + +INIT_MMX ssse3 +cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, 
left + movifnidn leftq, leftmp + add leftq, 4 + mov lineq, -2 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movd [dstq ], m1 + movd [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_MMX ssse3 +cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 8 + mov lineq, -4 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movq [dstq ], m1 + movq [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 16 + mov lineq, -8 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 32 + mov lineq, -16 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq +16], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_MMX ssse3 +cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + pshufb m2, m0, [GLOBAL(sh_b23456777)] + pshufb m1, m0, [GLOBAL(sh_b01234577)] + pshufb m0, [GLOBAL(sh_b12345677)] + pavgb m3, m2, m1 + pxor m2, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + psrlq m0, 8 + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_MMX ssse3 +cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, 
above, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + mova m1, [GLOBAL(sh_b12345677)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m2, m0, [GLOBAL(sh_b23456777)] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, dst8, line + lea stride3q, [strideq*3] + lea dst8q, [dstq+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m0, m3 + + ; first 4 lines and first half of 3rd 4 lines + mov lined, 2 +.loop: + mova [dstq ], m0 + movhps [dst8q ], m0 + pshufb m0, m1 + mova [dstq +strideq ], m0 + movhps [dst8q+strideq ], m0 + pshufb m0, m1 + mova [dstq +strideq*2 ], m0 + movhps [dst8q+strideq*2 ], m0 + pshufb m0, m1 + mova [dstq +stride3q ], m0 + movhps [dst8q+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + dec lined + jnz .loop + + ; bottom-right 8x8 block + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + lea dstq, [dstq+strideq*4] + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, 
goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m4, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, dst16, line + lea stride3q, [strideq*3] + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m4 + pxor m2, m4 + palignr m5, m4, m0, 1 + palignr m6, m4, m0, 2 + pshufb m4, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m4, m3 + pavgb m3, m0, m6 + pxor m0, m6 + pand m0, [GLOBAL(pb_1)] + psubb m3, m0 + pavgb m5, m3 + + ; write 4x4 lines (and the first half of the second 4x4 lines) + mov lined, 4 +.loop: + mova [dstq ], m5 + mova [dstq +16], m4 + mova [dst16q ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +strideq ], m3 + mova [dstq +strideq +16], m4 + mova [dst16q+strideq ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + mova [dstq +strideq*2 ], m5 + mova [dstq +strideq*2+16], m4 + mova [dst16q+strideq*2 ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +stride3q ], m3 + mova [dstq +stride3q +16], m4 + mova [dst16q+stride3q ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + lea dstq, [dstq +strideq*4] + lea dst16q, [dst16q+strideq*4] + dec lined + jnz .loop + + ; write second half of second 4x4 lines + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + + RESTORE_GOT + RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) 
+; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova 
[dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. 
+ movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. + movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... 
A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh 
- + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET + +INIT_MMX ssse3 +cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; abcd [byte] + 
pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte] + pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 + pavgb m1, m0 ; ab, bc, cd, d [byte] + + punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d + movd [dstq ], m1 + psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d + movd [dstq+strideq], m1 + lea dstq, [dstq+strideq*2] + psrlq m1, 16 ; cd, c3d, d, d + movd [dstq ], m1 + pshufw m1, m1, q1111 ; d, d, d, d + movd [dstq+strideq], m1 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset + GET_GOT goffsetq + movq m3, [leftq] ; abcdefgh [byte] + lea stride3q, [strideq*3] + + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 + pavgb m0, m2 + punpcklbw m0, m3 ; interleaved output + + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh + psrldq m0, 2 + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m0, [leftq] ; abcdefghijklmnop [byte] + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + pavgb m1, m0 ; ab, bc, cd .. 
no, op, pp [byte] + + punpckhbw m4, m1, m3 ; interleaved input + punpcklbw m1, m3 ; interleaved output + mova [dstq ], m1 + palignr m3, m4, m1, 2 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 4 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 6 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + palignr m3, m4, m1, 8 + mova [dstq ], m3 + palignr m3, m4, m1, 10 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 12 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 14 + mova [dstq+stride3q ], m3 + DEFINE_ARGS dst, stride, stride3, line + mov lined, 2 + mova m0, [GLOBAL(sh_b23456789abcdefff)] +.loop: + lea dstq, [dstq+strideq*4] + mova [dstq ], m4 + pshufb m4, m0 + mova [dstq+strideq ], m4 + pshufb m4, m0 + mova [dstq+strideq*2], m4 + pshufb m4, m0 + mova [dstq+stride3q ], m4 + pshufb m4, m0 + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m1, [leftq] ; 0-15 [byte] + mova m2, [leftq+16] ; 16-31 [byte] + pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] + pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 + palignr m6, m2, m1, 1 + palignr m5, m2, m1, 2 + pavgb m2, m4 ; high 16px even lines + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 + pavgb m1, m6 ; low 16px even lines + + punpckhbw m6, m1, m0 ; interleaved output 2 + punpcklbw m1, m0 ; interleaved output 1 + + punpckhbw m7, m2, m3 ; interleaved output 4 + punpcklbw m2, m3 ; interleaved output 3 + + ; output 1st 8 lines (and half of 2nd 8 lines) + DEFINE_ARGS dst, stride, stride3, dst8 + lea dst8q, [dstq+strideq*8] + mova [dstq ], m1 + mova [dstq +16], m6 + mova [dst8q ], m6 + palignr m0, m6, m1, 2 + palignr m4, m2, m6, 2 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 4 + palignr m4, m2, m6, 4 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova 
[dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 6 + palignr m4, m2, m6, 6 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m0, m6, m1, 8 + palignr m4, m2, m6, 8 + mova [dstq ], m0 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m0, m6, m1, 10 + palignr m4, m2, m6, 10 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 12 + palignr m4, m2, m6, 12 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 14 + palignr m4, m2, m6, 14 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines + mova [dstq +16], m2 + mova [dst8q ], m2 + palignr m4, m7, m2, 2 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 4 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 6 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m4, m7, m2, 8 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m4, m7, m2, 10 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 12 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 14 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 3rd 8 lines and half of 4th 8 lines + mova m0, [GLOBAL(sh_b23456789abcdefff)] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, 
m0 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + + ; output last half of 4th 8 lines + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + lea dstq, [dstq+strideq*4] + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + + ; done! + RESTORE_GOT + RET diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c new file mode 100644 index 000000000..770a65f4c --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -0,0 +1,986 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> /* AVX2 */ + +#include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" + +static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, 
thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, 
_mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = 
_mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + 
_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = 
_mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = 
_mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 
128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +/* Wide loop filter across a horizontal edge for a 16-pixel-wide segment. s addresses row q0 and p is the row stride in bytes: row pN is read at s - (N + 1) * p and row qN at s + N * p, for N = 0..7; rows p6..p0 and q0..q6 are written back with unaligned 16-byte stores. Only byte 0 of _blimit, _limit and _thresh is used. The masks and the narrow filter work on 128-bit registers; the wider averaging filters work on 256-bit registers holding the 16 pixels as 16-bit lanes. */ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, + q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, + p256_0, q256_0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + /* each broadcast loads 16 bytes of one row into both 128-bit lanes of a 256-bit register */ p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 5 * p))); + p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 4 * p))); + p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 3 * p))); + p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 2 * p))); + p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 1 * p))); + q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 0 * p))); + q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 1 * p))); + q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 2 * p))); + q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 3 * p))); + q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 4 * p))); + + /* low-lane 128-bit views used for the byte-wise mask and filter arithmetic */ p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + 
q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + /* hev: 0xff where max(|p1 - p0|, |q1 - q0|) exceeds thresh */ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + /* 2 * |p0 - q0| + |p1 - q1| / 2 is tested against blimit with saturating byte arithmetic; the 0xfe mask keeps the 16-bit shift from leaking bits between bytes */ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + /* mask becomes 0xff where the largest accumulated value is within limit */ mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + /* te0 with t1f, and t80 with t7f, are used below to rebuild the sign bits of each byte after a 16-bit logical right shift */ const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + /* xor with 0x80 flips the top bit so the pixels can be handled with signed saturating byte operations */ __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = 
_mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + /* work_a is 0xff for negative bytes; after the logical shift the low five bits are kept and the top three bits are set to 0xe0 for those bytes */ work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + /* the p1 and q1 adjustment is cleared where hev is set */ filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + /* flat still holds max(|p1 - p0|, |q1 - q0|); the p2, q2, p3 and q3 differences from p0 and q0 are folded in next */ work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), 
_mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + /* the p4 and q4 differences now held in work are folded into flat2 further down, not into flat */ flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 7 * p))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four 
= _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, + res_q; + + /* zero-extend each row to sixteen 16-bit lanes: the shuffle expands bytes 0..7 in the low lane and bytes 8..15 in the high lane */ const __m256i filter = _mm256_load_si256( + (__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); + + /* after the next statements pixelFilter_p == 8 + (p6 + ... + p0) + (q0 + ... + q6) and pixetFilter_p2p1p0 == 4 + (p2 + p1 + p0) + (q0 + q1 + q2); results using the first are shifted right by 4, results using the second by 3 */ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, + _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, + _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16(eight, + _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(four, + _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(p256_7, p256_0)), 4); + + /* pack back to bytes; 168 == 0xA8 places 64-bit element 2 next to element 0, so the low 128 bits hold all 16 result bytes */ flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + 
_mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + /* sum_p7, sum_q7, sum_p3 and sum_q3 start at twice the outermost row and gain one more copy at each later step */ sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + /* from here the p-side and q-side running sums diverge: at each step each side drops one more row of the opposite side */ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + 
sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = 
_mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + /* blend: take the flat_* value where flat is set, otherwise the unfiltered p2 and q2 or the filtered ps1, ps0, qs0 and qs1 */ p2 = _mm_andnot_si128(flat, p2); + 
flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + /* final blend with the flat2_* values where flat2 is set, followed by an unaligned 16-byte store of each row */ p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *) (s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *) (s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *) (s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *) (s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *) (s - 0 * p), q0); + + q1 = 
_mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *) (s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *) (s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *) (s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *) (s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *) (s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + } +} + +/* Entry point for the wide horizontal-edge loop filter. s, p, _blimit, _limit and _thresh are forwarded unchanged. count == 1 selects the 8-pixel-wide implementation; every other value of count selects the 16-pixel-wide implementation. */ void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); +} diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c new file mode 100644 index 000000000..e321dbebe --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -0,0 +1,1580 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vpx_ports/emmintrin_compat.h" + +/* Per-byte unsigned absolute difference |a - b|. For each byte one of the two saturating subtractions is zero, so OR-ing them yields the absolute difference without needing a signed type. */ static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), + (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), + (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), + (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), + (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), + (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = 
_mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), + abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + // Filter1 >> 3 + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = 
_mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *)(s + 6 * p))); + flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *)(s + 7 * p))); + work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + 
p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, + pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, + pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = 
_mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = 
_mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, 
q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} + +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), + _mm_srli_epi16(*f8_hi, 
3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), + _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i 
abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 
= _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const 
__m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = 
filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); + const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); + + const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = 
_mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 + f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), + _mm_add_epi16(p4_lo, f_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 + f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), + _mm_add_epi16(p4_hi, f_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); + + p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, 
&q5_hi, &p1_hi, &p2_hi, &p7_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); + q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + 
// wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + } +} + +// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. +void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); +} + +void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + + (void)count; + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const 
__m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), + abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), + abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + 
t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = 
_mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i 
*)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + 
_mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 
3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + 
// Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), 
q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = 
_mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, 
t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + } +} + +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + // Read in 16 lines + x0 = _mm_loadl_epi64((__m128i *)in0); + x8 = _mm_loadl_epi64((__m128i *)in1); + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); + x15 = _mm_loadl_epi64((__m128i *)(in1 + 
7*in_p)); + + x0 = _mm_unpacklo_epi8(x0, x1); + x1 = _mm_unpacklo_epi8(x2, x3); + x2 = _mm_unpacklo_epi8(x4, x5); + x3 = _mm_unpacklo_epi8(x6, x7); + + x8 = _mm_unpacklo_epi8(x8, x9); + x9 = _mm_unpacklo_epi8(x10, x11); + x10 = _mm_unpacklo_epi8(x12, x13); + x11 = _mm_unpacklo_epi8(x14, x15); + + x4 = _mm_unpacklo_epi16(x0, x1); + x5 = _mm_unpacklo_epi16(x2, x3); + x12 = _mm_unpacklo_epi16(x8, x9); + x13 = _mm_unpacklo_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 + x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + x2 = 
_mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 + x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 + x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 + x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 0*out_p), + _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1*out_p), + _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + _mm_storel_pd((double *)(out + 2*out_p), + _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3*out_p), + _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 4*out_p), + _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 + 
_mm_storeh_pd((double *)(out + 5*out_p), + _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + _mm_storel_pd((double *)(out + 6*out_p), + _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7*out_p), + _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void vp9_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); + unsigned char *src[1]; + unsigned char *dst[1]; + (void)count; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, p, dst, 8, 1); + + // Loop filtering + vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_lpf_horizontal_8_dual_sse2(t_dst 
+ 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void vp9_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm new file mode 100644 index 000000000..f5f7d5af7 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -0,0 +1,611 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp9_lpf_horizontal_4_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_lpf_horizontal_4_mmx) PRIVATE +sym(vp9_lpf_horizontal_4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, 
[rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, 
mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [GLOBAL(t80)] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_lpf_vertical_4_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_lpf_vertical_4_mmx) PRIVATE +sym(vp9_lpf_vertical_4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) 
;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], 
mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + 
psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, 
mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [GLOBAL(t80)] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [GLOBAL(t80)] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + ; mm7 = q1 + + ; transpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + + movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz .next8_v + + add rsp, 64 + pop rsp + ; begin epilog + pop rdi + pop rsi + 
RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 8 db 0xfe +align 16 +t80: + times 8 db 0x80 +align 16 +t3: + times 8 db 0x03 +align 16 +t4: + times 8 db 0x04 +align 16 +ones: + times 4 dw 0x0001 diff --git a/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm new file mode 100644 index 000000000..6029420d1 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -0,0 +1,287 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; This file is a duplicate of mfqe_sse2.asm in VP8. +; TODO(jackychen): Find a way to fix the duplicate. +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp9_filter_by_weight16x16_sse2) PRIVATE +sym(vp9_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw 
xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp9_filter_by_weight8x8_sse2) PRIVATE +sym(vp9_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE +sym(vp9_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov 
rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. + pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. 
promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 diff --git a/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm new file mode 100644 index 000000000..ec8bfdb18 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm @@ -0,0 +1,694 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_post_proc_down_and_across_xmm +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp9_post_proc_down_and_across_xmm) PRIVATE +sym(vp9_post_proc_down_and_across_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ALIGN_STACK 16, rax + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movdqa xmm0, [GLOBAL(rd42)] + sub rsp, 16 + movdqa [rsp], xmm0 +%define RD42 [rsp] +%else +%define RD42 [GLOBAL(rd42)] +%endif + + + movd xmm2, dword ptr arg(6) ;flimit + punpcklwd xmm2, xmm2 + punpckldq xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
+ pxor xmm0, xmm0 ; mm0 = 00000000 + +.nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +.nextcol: + movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 ; + + movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = r0 p0..p3 + psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw xmm7, xmm2 + + movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + neg rax + movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, 
xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi], xmm1 ; + + neg rax ; pitch is positive + add rsi, 8 + add rdi, 8 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + + jl .nextcol + + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-8]; + +.acrossnextcol: + movq xmm7, QWORD PTR [rdi +rdx -2] + movd xmm4, DWORD PTR [rdi +rdx +6] + + pslldq xmm4, 8 + por xmm4, xmm7 + + movdqa xmm3, xmm4 + psrldq xmm3, 2 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 + + + movdqa xmm5, xmm4 + psrldq xmm5, 3 + punpcklbw xmm5, xmm0 ; mm5 = p1..p4 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = p0..p3 + psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw xmm7, xmm2 + + movdqa xmm5, xmm4 + psrldq xmm5, 4 + punpcklbw xmm5, xmm0 ; mm5 = p2..p5 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + movdqa xmm5, xmm4 ; mm5 = p-2..p5 + punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + psrldq xmm4, 1 ; mm4 = p-1..p5 + punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 + psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 + 
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes + movdq2q mm0, xmm1 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + jl .acrossnextcol; + + ; last 8 pixels + movq QWORD PTR [rdi+rdx-8], mm0 + + ; done with this rwo + add rsi,rax ; next line + mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? + + dec rcx ; decrement count + jnz .nextrow ; next row + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + add rsp,16 + pop rsp +%endif + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef RD42 + + +;void vp9_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_xmm) PRIVATE +sym(vp9_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c + +#include "vp9/common/x86/convolve.h" +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 
+}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# else // clang > 3.3, and not 5.0 on macosx. +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // clang <= 3.3 +#elif defined(__GNUC__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +# else // gcc > 4.7 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // gcc <= 4.6 +#else // !(gcc || clang) +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + 
ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((const __m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((const __m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2= 
_mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = 
_mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t 
src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b7 = 
_mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = 
_mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + 
_mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + 
_mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction 
vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..5fd2857e1 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// tmmintrin.h. +#include "./vp9_rtcd.h" + +#include <tmmintrin.h> + +#include "vp9/common/x86/convolve.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +// These are reused by the avx2 intrinsics. 
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; + +void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with 
the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = 
_mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t 
*output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + for (i = 0; i < output_height; i++) { + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = 
_mm_unpacklo_epi8(srcReg3, srcReg4); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = 
_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + // load the first 7 rows of 16 bytes + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + for (i = 0; i < output_height; i++) { + // load the last 16 bytes + srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); + srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + 
// add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // merge the result together + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt3, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt3, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + + // save 16 bytes convolve result + 
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; + +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction 
vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; + +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm new file mode 100644 index 000000000..9dc8d0abb --- 
/dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm @@ -0,0 +1,987 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm7, [rdx] ;load filters (aligned 16-byte load) + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 ;bring k4..k7 down into the low quadword + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 ;low qword: k0 x4, high qword: k1 x4 + punpcklqdq xmm2, xmm3 ;low qword: k2 x4, high qword: k3 x4 + punpcklqdq xmm5, xmm4 ;low qword: k5 x4, high qword: k4 x4 + punpcklqdq xmm6, xmm7 ;low qword: k6 x4, high qword: k7 x4 + + movdqa k0k1, xmm0 ;store the paired filter factors on the stack + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 ;broadcast the low dword: every word becomes 64 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 ;all-zero constant for byte to word unpacking +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two rows in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 ;xmm5 goes first, matching the k5k4 layout + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 ;low qword now holds the k0, k1, k6 and k7 terms + paddsw xmm0, xmm2 ;k2 term + psrldq xmm2, 8 + paddsw xmm0, xmm5 ;k5 term + psrldq xmm5, 8 + paddsw xmm0, xmm2 ;k3 term, added late (see the tap3/tap4 note) + paddsw xmm0, xmm5 ;k4 term, added last + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, 
[rdi] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 ;rounded byte average with the destination +%endif + movd [rdi], xmm0 ;store 4 output pixels +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm7, [rdx] ;load filters (aligned 16-byte load) + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 ;replicate the tap from the low half into all 8 words + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 ;replicate the tap from the high half into all 8 words + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 ;broadcast the low dword: every word becomes 64 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 ;all-zero constant for byte to word unpacking +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 (callers set rax to the pitch and rdx to 3 * pitch) + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] ;step src down one row; offsets below are relative to the new rsi + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero ;unpack the 8 pixels of each row to words + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 ;multiply each row by its filter factor + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 ;saturating sum of the eight products + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 ;k3 and k4 terms go in last (see the tap3/tap4 note) + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 ;rounded byte average with the destination +%endif + movq [rdi + %2], xmm0 ;store 8 output pixels at the given byte offset +%endm + 
+;void vp9_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_sse2) PRIVATE +sym(vp9_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_v8_sse2) PRIVATE +sym(vp9_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] 
+ %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_sse2) PRIVATE +sym(vp9_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 
3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 
[rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_sse2) PRIVATE +sym(vp9_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + 
RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_sse2) PRIVATE +sym(vp9_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_sse2) PRIVATE +sym(vp9_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 
* 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + 
psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp 
+ 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm new file mode 100644 index 000000000..4a5bf1b60 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -0,0 +1,1071 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro VERTx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 ;copy low qword to high: the tap pair fills all 8 words + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 ;broadcast the rounding term to every word + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx ;rax is src advanced by one pitch: base for rows D, F and H + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: ;one 4-pixel output row per iteration + movd xmm0, [rsi] ;A + movd xmm1, [rsi + rdx] ;B + movd xmm2, [rsi + rdx * 2] ;C + movd xmm3, [rax + rdx * 2] ;D + movd xmm4, [rsi + rdx * 4] ;E + movd xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movd xmm6, [rsi + rbx] ;G + movd xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 ;A*k0 and B*k1 summed per pixel into 16-bit lanes + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + movdqa xmm1, xmm2 + paddsw xmm0, xmm6 + pmaxsw xmm2, xmm4 ;larger of the two middle partial sums + pminsw xmm4, xmm1 ;smaller of the two middle partial sums + paddsw xmm0, xmm4 ;smaller one is added first (presumably to limit saturation) + paddsw xmm0, xmm2 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + + add rsi, rdx ;next source row + add rax, rdx +%if %1 + movd xmm1, [rdi] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 ;store 4 output pixels + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +%macro VERTx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b 
;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 ;copy low qword to high: the tap pair fills all 8 words + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 ;broadcast the rounding term to every word + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx ;rax is src advanced by one pitch: base for rows D, F and H + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: ;one 8-pixel output row per iteration + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 ;larger of the two middle partial sums + pminsw xmm4, xmm1 ;smaller of the two middle partial sums + paddsw xmm0, xmm4 ;smaller one is added first + paddsw xmm0, xmm2 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + + add rsi, rdx ;next source row + add rax, rdx +%if %1 + movq xmm1, [rdi] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store 8 output pixels + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + + +%macro VERTx16 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 ;copy low qword to high: the tap pair fills all 8 words + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 ;broadcast the rounding term to every word + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + 
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx ;rax is src advanced by one pitch: base for rows D, F and H + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: ;one 16-pixel output row per iteration, done as two 8-pixel halves + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte +%if %1 + movq xmm1, [rdi] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the left 8 output pixels + + movq xmm0, [rsi + 8] ;A (right 8 columns from here on) + movq xmm1, [rsi + rdx + 8] ;B + movq xmm2, [rsi + rdx * 2 + 8] ;C + movq xmm3, [rax + rdx * 2 + 8] ;D + movq xmm4, [rsi + rdx * 4 + 8] ;E + movq xmm5, [rax + rdx * 4 + 8] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx + 8] ;G + movq xmm7, [rax + rbx + 8] ;H + punpcklbw xmm6, xmm7 ;G H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + + add rsi, rdx ;next source row + add rax, rdx +%if %1 + movq xmm1, [rdi+8] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 +%endif + + movq [rdi+8], xmm0 ;store the right 8 output pixels + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +;void vp9_filter_block1d4_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; 
unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx4 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16*5 + pop rsp ;presumably the rsp saved by ALIGN_STACK (macro not shown here) + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx8 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx16 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + 
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx4 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 1 ;1 selects the averaging store + + add rsp, 16*5 + pop rsp ;presumably the rsp saved by ALIGN_STACK (macro not shown here) + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx8 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 1 ;1 selects the averaging store + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx ;rbx holds pitch * 6 inside VERTx16 + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 1 ;1 selects the averaging store + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%macro HORIZx4_ROW 2 + movdqa %2, %1 + pshufb %1, [GLOBAL(shuf_t0t1)] + pshufb %2, [GLOBAL(shuf_t2t3)] + pmaddubsw %1, k0k1k4k5 ;low qword: k0/k1 sums, high qword: k4/k5 sums + pmaddubsw %2, k2k3k6k7 ;low qword: k2/k3 sums, high qword: k6/k7 sums + + movdqa xmm4, %1 ;xmm4, xmm5 and xmm6 are used as scratch + movdqa xmm5, %2 + psrldq %1, 8 ;k4/k5 sums down to the low qword + psrldq %2, 8 ;k6/k7 sums down to the low qword + movdqa xmm6, xmm5 + + paddsw xmm4, %2 ;k0/k1 plus k6/k7 + pmaxsw xmm5, %1 ;larger of the two middle partial sums + pminsw %1, xmm6 ;smaller of the two middle partial sums + paddsw %1, xmm4 
+ paddsw %1, xmm5 + + paddsw %1, krd ;rounding + psraw %1, 7 ;shift + packuswb %1, %1 ;pack to byte +%endm + +%macro HORIZx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm6, xmm4, 0b ;k0_k1 + pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 + pshuflw xmm7, xmm4, 01010101b ;k2_k3 + pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 + pshufd xmm5, xmm5, 0 ;rounding + + movdqa k0k1k4k5, xmm6 + movdqa k2k3k6k7, xmm7 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + shr rcx, 1 ;NOTE(review): rcx is 0 here when output_height is below 2 and the dec/jnz loop does not guard against that -- confirm callers always pass height >= 2 +.loop: + ;Do two rows at once + movq xmm0, [rsi - 3] ;load src + movq xmm1, [rsi + 5] + movq xmm2, [rsi + rax - 3] + movq xmm3, [rsi + rax + 5] + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + + HORIZx4_ROW xmm0, xmm1 + HORIZx4_ROW xmm2, xmm3 +%if %1 + movd xmm1, [rdi] ;avg variant only: existing destination pixels + pavgb xmm0, xmm1 + movd xmm3, [rdi + rdx] + pavgb xmm2, xmm3 +%endif + movd [rdi], xmm0 + movd [rdi +rdx], xmm2 + + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] + + dec rcx + jnz .loop + + ; Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done ;even height: nothing left (NOTE(review): the .done label below has no trailing colon) + + movq xmm0, [rsi - 3] ; load src + movq xmm1, [rsi + 5] + punpcklqdq xmm0, xmm1 + + HORIZx4_ROW xmm0, xmm1 +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +.done +%endm + +%macro HORIZx8_ROW 4 + movdqa %2, %1 + movdqa %3, %1 + movdqa %4, %1 + + pshufb %1, [GLOBAL(shuf_t0t1)] ;adjacent source byte pairs for each tap pair + pshufb %2, [GLOBAL(shuf_t2t3)] + pshufb %3, [GLOBAL(shuf_t4t5)] + pshufb %4, [GLOBAL(shuf_t6t7)] + + pmaddubsw %1, k0k1 + pmaddubsw %2, k2k3 + pmaddubsw %3, k4k5 + pmaddubsw %4, k6k7 + + paddsw %1, %4 + movdqa %4, %2 + pmaxsw %2, %3 ;larger of the two middle partial sums + pminsw %3, %4 ;smaller of the two middle partial sums + paddsw %1, %3 + paddsw %1, %2 + + paddsw %1, krd ;rounding + psraw %1, 7 ;shift + packuswb %1, %1 ;pack to byte 
+%endm + +%macro HORIZx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 ;broadcast the rounding term to every word + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + shr rcx, 1 ;two rows per iteration; NOTE(review): rcx is 0 here when output_height is below 2 -- confirm callers + +.loop: + movq xmm0, [rsi - 3] ;load src + movq xmm3, [rsi + 5] + movq xmm4, [rsi + rax - 3] + movq xmm7, [rsi + rax + 5] + punpcklqdq xmm0, xmm3 + punpcklqdq xmm4, xmm7 + + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 + HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 +%if %1 + movq xmm1, [rdi] ;avg variant only: existing destination pixels + movq xmm2, [rdi + rdx] + pavgb xmm0, xmm1 + pavgb xmm4, xmm2 +%endif + movq [rdi], xmm0 + movq [rdi + rdx], xmm4 + + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] + dec rcx + jnz .loop + + ;Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done ;even height: nothing left (NOTE(review): the .done label below has no trailing colon) + + movq xmm0, [rsi - 3] + movq xmm3, [rsi + 5] + punpcklqdq xmm0, xmm3 + + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 +.done +%endm + +%macro HORIZx16 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low words (rounding term) + + movdqa xmm4, [rdx] ;load filters (aligned 16-byte load) + movq xmm5, rcx + packsswb xmm4, xmm4 ;narrow the 16-bit taps to signed bytes (saturating) + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq 
xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 ;broadcast the rounding term to every word + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.loop: ;one 16-pixel output row per iteration + prefetcht0 [rsi + 2 * rax -3] + + movq xmm0, [rsi - 3] ;load src data + movq xmm4, [rsi + 5] + movq xmm6, [rsi + 13] + punpcklqdq xmm0, xmm4 ;xmm0: 16 source bytes starting 3 before src + punpcklqdq xmm4, xmm6 ;xmm4: 16 source bytes starting 5 after src + + movdqa xmm7, xmm0 + + punpcklbw xmm7, xmm7 ;each of the low 8 bytes doubled + punpckhbw xmm0, xmm0 ;each of the high 8 bytes doubled + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + + palignr xmm0, xmm7, 1 ;odd byte shifts of the doubled data give adjacent byte pairs + palignr xmm1, xmm7, 5 + pmaddubsw xmm0, k0k1 + palignr xmm2, xmm7, 9 + pmaddubsw xmm1, k2k3 + palignr xmm3, xmm7, 13 + + pmaddubsw xmm2, k4k5 + pmaddubsw xmm3, k6k7 + paddsw xmm0, xmm3 + + movdqa xmm3, xmm4 + punpcklbw xmm3, xmm3 + punpckhbw xmm4, xmm4 + + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + + palignr xmm4, xmm3, 1 + palignr xmm5, xmm3, 5 + palignr xmm6, xmm3, 9 + palignr xmm7, xmm3, 13 + + movdqa xmm3, xmm1 + pmaddubsw xmm4, k0k1 + pmaxsw xmm1, xmm2 ;larger of the two middle partial sums (left half) + pmaddubsw xmm5, k2k3 + pminsw xmm2, xmm3 ;smaller of the two middle partial sums (left half) + pmaddubsw xmm6, k4k5 + paddsw xmm0, xmm2 + pmaddubsw xmm7, k6k7 + paddsw xmm0, xmm1 + + paddsw xmm4, xmm7 + movdqa xmm7, xmm5 + pmaxsw xmm5, xmm6 + pminsw xmm6, xmm7 + paddsw xmm4, xmm6 + paddsw xmm4, xmm5 + + paddsw xmm0, krd ;rounding + paddsw xmm4, krd + psraw xmm0, 7 ;shift + psraw xmm4, 7 + packuswb xmm0, xmm0 ;pack to byte + packuswb xmm4, xmm4 + punpcklqdq xmm0, xmm4 ;left 8 pixels in the low qword, right 8 in the high qword +%if %1 + movdqa xmm1, [rdi] ;aligned load: output_ptr must be 16-byte aligned + pavgb xmm0, xmm1 +%endif + + lea rsi, [rsi + rax] + movdqa [rdi], xmm0 ;aligned store: output_ptr and output_pitch must keep rdi 16-byte aligned + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push 
rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 3 ;three 16-byte stack slots, named by the defines below + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + + HORIZx4 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16 * 3 + pop rsp ;presumably the rsp saved by ALIGN_STACK (macro not shown here) + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 0 ;0 selects the plain store, 1 the averaging store + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_avg_ssse3): + push rbp + mov rbp, rsp + 
SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 3 ;three 16-byte stack slots, named by the defines below + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + + HORIZx4 1 ;1 selects the averaging store + + add rsp, 16 * 3 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 1 ;1 selects the averaging store + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 ;five 16-byte stack slots, named by the defines below + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 1 ;1 selects the averaging store + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +SECTION_RODATA +align 16 +shuf_t0t1: + db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 ;pshufb mask: byte pairs (i, i+1) from offset 0 +align 16 +shuf_t2t3: + db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 ;same pairs from offset 2 +align 16 +shuf_t4t5: + db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 ;same pairs from offset 4 +align 16 +shuf_t6t7: + db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..d94ccf2e9 --- /dev/null +++ 
b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -0,0 +1,448 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+;GET_PARAM_4: register setup for the 4-pixel-wide loops (per-line notes say what each register ends up holding)
+%macro GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;= 0x00400040: two 16-bit words of 64, the rounding term for the >>7
+
+    movdqa      xmm3, [rdx]                 ;load filters: 16 bytes used as eight 16-bit taps (aligned load, pointer must be 16-byte aligned)
+    pshuflw     xmm4, xmm3, 11111111b       ;k3: word 3 copied into each of the low four words
+    psrldq      xmm3, 8                     ;move words 4..7 down to the low half
+    pshuflw     xmm3, xmm3, 0b              ;k4: original word 4 copied into each of the low four words
+    punpcklqdq  xmm4, xmm3                  ;k3k4: xmm4 = k3 x4 (low half), k4 x4 (high half)
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0               ;xmm3 = 64 in all eight words
+
+    pxor        xmm2, xmm2                  ;xmm2 = 0, used to zero-extend bytes to words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+;APPLY_FILTER_4 avg: one 4-pixel output row from xmm0 (first input) and xmm1 (second input); avg != 0 blends with [rdi]
+%macro APPLY_FILTER_4 1
+
+    punpckldq   xmm0, xmm1                  ;both 4-pixel inputs in one register: xmm0's low dword, then xmm1's
+    punpcklbw   xmm0, xmm2                  ;unpack to word
+    pmullw      xmm0, xmm4                  ;multiply the filter factors: first input by k3, second by k4
+
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8
+    paddsw      xmm0, xmm1                  ;low four words = a*k3 + b*k4
+
+    paddsw      xmm0, xmm3                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]                 ;the 4 bytes already in the destination
+    pavgb       xmm0, xmm1                  ;rounded byte average with them
+%endif
+
+    movd        [rdi], xmm0                 ;store 4 result bytes
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx                         ;last instruction on purpose: the jnz after the macro tests this result
+%endm
+;GET_PARAM: register setup for the 8- and 16-pixel-wide loops (uses xmm6/xmm7, hence SAVE_XMM 7 in those functions)
+%macro GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;= 0x00400040: two 16-bit words of 64
+
+    movdqa      xmm7, [rdx]                 ;load filters
+
+    pshuflw     xmm6, xmm7, 11111111b       ;k3 in the low four words
+    pshufhw     xmm7, xmm7, 0b              ;k4 in the high four words
+    punpcklwd   xmm6, xmm6                  ;xmm6 = k3 in all eight words
+    punpckhwd   xmm7, xmm7                  ;xmm7 = k4 in all eight words
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0               ;xmm4 = 64 in all eight words
+
+    pxor        xmm5, xmm5                  ;xmm5 = 0, used to zero-extend bytes to words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+;APPLY_FILTER_8 avg: one 8-pixel output row from the low 8 bytes of xmm0 and xmm1
+%macro APPLY_FILTER_8 1
+    punpcklbw   xmm0, xmm5
+    punpcklbw   xmm1, xmm5
+
+    pmullw      xmm0, xmm6                  ;first input * k3
+    pmullw      xmm1, xmm7                  ;second input * k4
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm4                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx                         ;sets the flags for the jnz that follows the macro
+%endm
+;APPLY_FILTER_16 avg: 16-pixel row; expects xmm0 = xmm2 = first input and xmm1 = xmm3 = second input
+%macro APPLY_FILTER_16 1
+    punpcklbw   xmm0, xmm5                  ;low 8 pixels of each input as words
+    punpcklbw   xmm1, xmm5
+    punpckhbw   xmm2, xmm5                  ;high 8 pixels of each input as words
+    punpckhbw   xmm3, xmm5
+
+    pmullw      xmm0, xmm6
+    pmullw      xmm1, xmm7
+    pmullw      xmm2, xmm6
+    pmullw      xmm3, xmm7
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm2, xmm3
+
+    paddsw      xmm0, xmm4                  ;rounding
+    paddsw      xmm2, xmm4
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result (unaligned store, no alignment demand on rdi)
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+;4-wide vertical 2-tap filter: every output row blends the source row at rsi with the row one stride below it
+global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
+sym(vp9_filter_block1d4_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next row
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;flags come from the 'dec rcx' that ends the macro
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide vertical 2-tap filter
+global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
+sym(vp9_filter_block1d8_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide vertical 2-tap filter
+global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
+sym(vp9_filter_block1d16_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + rax]           ;1
+    movdqa      xmm2, xmm0                  ;second copies: APPLY_FILTER_16 unpacks the high halves from xmm2/xmm3
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide vertical 2-tap filter, result averaged with the bytes already in the destination
+global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide vertical 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide vertical 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + rax]           ;1
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide horizontal 2-tap filter: every output pixel blends source pixel x with pixel x+1 of the same row
+global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
+sym(vp9_filter_block1d4_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src. NOTE(review): reads 16 bytes, only bytes 0..4 are used - confirm the source is padded
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1 = the same row shifted down one byte, i.e. pixel x+1
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide horizontal 2-tap filter
+global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
+sym(vp9_filter_block1d8_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (bytes 0..8 are consumed)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide horizontal 2-tap filter
+global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
+sym(vp9_filter_block1d16_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 1]             ;second load one byte on: a register shift would lose pixel 16
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte read, bytes 0..4 used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 1]
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
new file mode 100644
index 000000000..b5e18fe6d
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,422 @@
+;
+; Copyright (c) 2014 The WebM project
authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+;GET_PARAM_4: register setup for the 4-pixel-wide loops; taps are packed to bytes so pmaddubsw can apply both at once
+%macro GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;= 0x00400040: two 16-bit words of 64, the rounding term for the >>7
+
+    movdqa      xmm3, [rdx]                 ;load filters: 16 bytes used as eight 16-bit taps (aligned load)
+    psrldq      xmm3, 6                     ;drop words 0..2 so that k3 is word 0 and k4 is word 1
+    packsswb    xmm3, xmm3                  ;words -> signed bytes. NOTE(review): clamps to [-128,127], a tap of 128 would become 127 - confirm callers exclude it
+    pshuflw     xmm3, xmm3, 0b              ;k3_k4: byte pair (k3,k4) in each of the low four words
+
+    movq        xmm2, rcx                   ;rounding
+    pshufd      xmm2, xmm2, 0               ;xmm2 = 64 in all eight words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+;APPLY_FILTER_4 avg: one 4-pixel output row from xmm0 (first input) and xmm1 (second input); avg != 0 blends with [rdi]
+%macro APPLY_FILTER_4 1
+    punpcklbw   xmm0, xmm1                  ;interleave: a0,b0,a1,b1,... so each word holds one (a,b) pixel pair
+    pmaddubsw   xmm0, xmm3                  ;per word: a*k3 + b*k4 (pixels unsigned, taps signed)
+
+    paddsw      xmm0, xmm2                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]                 ;the 4 bytes already in the destination
+    pavgb       xmm0, xmm1                  ;rounded byte average with them
+%endif
+    movd        [rdi], xmm0                 ;store 4 result bytes
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx                         ;last instruction on purpose: the jnz after the macro tests this result
+%endm
+;GET_PARAM: register setup for the 8- and 16-pixel-wide loops (uses xmm6/xmm7, hence SAVE_XMM 7 in those functions)
+%macro GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;= 0x00400040: two 16-bit words of 64
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    psrldq      xmm7, 6                     ;k3 becomes word 0, k4 word 1
+    packsswb    xmm7, xmm7                  ;words -> signed bytes (saturating, see NOTE in GET_PARAM_4)
+    pshuflw     xmm7, xmm7, 0b              ;k3_k4
+    punpcklwd   xmm7, xmm7                  ;xmm7 = byte pair (k3,k4) in all eight words
+
+    movq        xmm6, rcx                   ;rounding
+    pshufd      xmm6, xmm6, 0               ;xmm6 = 64 in all eight words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+;APPLY_FILTER_8 avg: one 8-pixel output row from the low 8 bytes of xmm0 and xmm1
+%macro APPLY_FILTER_8 1
+    punpcklbw   xmm0, xmm1                  ;interleave the two inputs byte by byte
+    pmaddubsw   xmm0, xmm7                  ;per word: a*k3 + b*k4
+
+    paddsw      xmm0, xmm6                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx                         ;sets the flags for the jnz that follows the macro
+%endm
+;APPLY_FILTER_16 avg: 16-pixel row; expects xmm0 = xmm2 = first input and xmm1 = second input
+%macro APPLY_FILTER_16 1
+    punpcklbw   xmm0, xmm1                  ;low 8 pixel pairs
+    punpckhbw   xmm2, xmm1                  ;high 8 pixel pairs
+    pmaddubsw   xmm0, xmm7
+    pmaddubsw   xmm2, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    paddsw      xmm2, xmm6
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result (unaligned store, no alignment demand on rdi)
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+;4-wide vertical 2-tap filter: every output row blends the source row at rsi with the row one stride below it
+global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next row
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;flags come from the 'dec rcx' that ends the macro
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide vertical 2-tap filter
+global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide vertical 2-tap filter
+global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + rax]           ;1
+    movdqa      xmm2, xmm0                  ;second copy: APPLY_FILTER_16 builds the high half from xmm2
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide vertical 2-tap filter, result averaged with the bytes already in the destination
+global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide vertical 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide vertical 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + rax]           ;1
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide horizontal 2-tap filter: every output pixel blends source pixel x with pixel x+1 of the same row
+global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src. NOTE(review): reads 16 bytes, only bytes 0..4 reach the 4 stored pixels - confirm the source is padded
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1 = the same row shifted down one byte, i.e. pixel x+1
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide horizontal 2-tap filter
+global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (bytes 0..8 are consumed)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide horizontal 2-tap filter
+global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 1]             ;second load one byte on: a register shift would lose pixel 16
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;4-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte read, bytes 0..4 used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;8-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;16-wide horizontal 2-tap filter, averaged with the destination
+global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
-- cgit v1.2.3