diff options
author | Matt A. Tobin <email@mattatobin.com> | 2020-04-07 23:30:51 -0400 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2020-04-14 13:26:42 +0200 |
commit | 277f2116b6660e9bbe7f5d67524be57eceb49b8b (patch) | |
tree | 4595f7cc71418f71b9a97dfaeb03a30aa60f336a /media/libaom/src/aom_dsp/x86 | |
parent | d270404436f6e84ffa3b92af537ac721bf10d66e (diff) | |
download | UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.gz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.lz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.xz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.zip |
Move aom source to a sub-directory under media/libaom
There is no damned reason to treat this differently than any other media lib given its license and there never was.
Diffstat (limited to 'media/libaom/src/aom_dsp/x86')
93 files changed, 37601 insertions, 0 deletions
diff --git a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 000000000..5f5bf5f14 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; + +#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 +#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 +#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 +#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 +#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 +#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t 
dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); + +#if ARCH_X86_64 +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); + +#endif // ARCH_X86_64 +#endif // HAVE_SSE2 diff --git 
a/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm new file mode 100644 index 000000000..7283c32b8 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -0,0 +1,297 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + cmp r4d, 64 + je .w64 +%ifidn %2, highbd + cmp r4d, 128 + je .w128 + +.w256: + mov r4d, dword hm +.loop256: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, 
[dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + movu m0, [srcq+128] + movu m1, [srcq+128+16] + movu m2, [srcq+128+32] + movu m3, [srcq+128+48] +%ifidn %1, avg + pavg m0, [dstq+128] + pavg m1, [dstq+128+16] + pavg m2, [dstq+128+32] + pavg m3, [dstq+128+48] +%endif + mova [dstq+128 ], m0 + mova [dstq+128+16], m1 + mova [dstq+128+32], m2 + mova [dstq+128+48], m3 + movu m0, [srcq+128+64] + movu m1, [srcq+128+80] + movu m2, [srcq+128+96] + movu m3, [srcq+128+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+128+64] + pavg m1, [dstq+128+80] + pavg m2, [dstq+128+96] + pavg m3, [dstq+128+112] +%endif + mova [dstq+128+64], m0 + mova [dstq+128+80], m1 + mova [dstq+128+96], m2 + mova [dstq+128+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop256 + RET +%endif + +.w128: + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + 
movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + 
movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +convolve_fn copy, highbd diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..b6f040791 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,613 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
+ +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + 
punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 
+;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define 
krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; 
short *filter +;) +global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR 
arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + 
HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..7b3fe6419 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,338 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 
+ movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +SECTION .text + +global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu 
xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git 
a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..94b5da171 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,1441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_ports/mem.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. 
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h4_avx2( + const uint8_t 
*src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, 
output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // 
if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit 
(short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + 
srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + 
__m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // 
(part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have 
the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + 
+ // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = 
_mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + 
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + 
__m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two 
consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load 
the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the 
last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, 
resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + 
_mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, 
filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = 
_mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, 
firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, 
_mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + 
__m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo 
= _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 +#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 +#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 +#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 +#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 +#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 +#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 +// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, 
ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..325a21b76 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +// These are reused by the avx2 intrinsics. +filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; + +void aom_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, 
srcRegFilt1); + src_ptr += src_pixels_per_line; + + // save only 4 bytes + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer 
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr += src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i 
*)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + for (i = 0; i < output_height; i++) { + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg 
= _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr += src_pitch; + + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + + output_ptr += out_pitch; + } +} + +filter8_1dfunction aom_filter_block1d16_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_ssse3; + +#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 +#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 +#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 +#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 +#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 +#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 + +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; + +// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t 
*filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm new file mode 100644 index 000000000..c88fc9ffb --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -0,0 +1,615 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
+ +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd 
xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_v8_sse2) PRIVATE +sym(aom_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 
5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_v8_sse2) PRIVATE +sym(aom_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; 
unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d16_v8_sse2) PRIVATE +sym(aom_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_h8_sse2) PRIVATE +sym(aom_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, 
xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_h8_sse2) PRIVATE +sym(aom_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global 
sym(aom_filter_block1d16_h8_sse2) PRIVATE +sym(aom_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm new file mode 100644 index 000000000..3ca7921b6 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -0,0 +1,870 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 +even_byte_mask: times 8 dw 0x00ff + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffav1) must be followed to prevent outranges. +; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%define LOCAL_VARS_SIZE 16*6 + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. 
+ pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, 
k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + 
pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] 
+ movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src +%if ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif + movu m5, [srcq] + mova m7, m5 + pand m5, m2 + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER4 h8 + +;------------------------------------------------------------------------------- + +; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea 
src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 
+ pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw 
m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + 
pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 
+SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8, 4 diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..d0b4b2839 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa 
xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_sse2) PRIVATE +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_sse2) PRIVATE +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + 
UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_sse2) PRIVATE +sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_sse2) PRIVATE +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_sse2) PRIVATE +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_sse2) PRIVATE +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..59edc49a9 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,267 @@ +; +; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea 
rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global 
sym(aom_filter_block1d8_h2_ssse3) PRIVATE +sym(aom_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_ssse3) PRIVATE +sym(aom_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c new file mode 100644 index 000000000..4f5e3f8c1 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_integer.h" + +#include "config/aom_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. 
+ +void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, w, h, 0, 0); +} + +void aom_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, w, h, 0, 0, + bd); +} diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..67fb4d32b --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> // SSE4.1 +#include <immintrin.h> // AVX2 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w16_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, + int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s1_0 = yy_loadu_256(src1); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + __m256i res = _mm256_packus_epi16(res0, res0); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); +} + +static INLINE void blend_a64_d16_mask_w32_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, + const __m256i *v_maxval, int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s0_1 = yy_loadu_256(src0 + 16); + const __m256i s1_0 = yy_loadu_256(src1); + const __m256i s1_1 = yy_loadu_256(src1 + 16); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + 
_mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), + _mm256_unpacklo_epi16(*m1, max_minus_m1)); + __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), + _mm256_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); + __m256i res = _mm256_packus_epi16(res0, res1); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm256_storeu_si256((__m256i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m = xx_loadu_128(mask); + const __m256i m0 = _mm256_cvtepu8_epi16(m); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i 
v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < 
h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const 
CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, 
src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + 
default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + 
_mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = 
_mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = 
xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t 
*src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += 
dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 
mask_stride, w, h);
+      break;
+  }
+}
+
+// Blends h rows of 16 bytes (one 128-bit load/store per row) using a mask
+// that is sub-sampled vertically only: every output row takes the byte-wise
+// rounded average (_mm_avg_epu8) of mask rows `mask` and
+// `mask + mask_stride`, after which `mask` advances by two rows.
+// The loop is do/while, so h must be at least 1.
+static INLINE void blend_a64_mask_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    // The weights are packed bytes, so the complement must be taken per
+    // byte. A 16-bit subtract gives the same bytes only while no byte
+    // exceeds AOM_BLEND_A64_MAX_ALPHA; otherwise the borrow leaks into the
+    // neighbouring weight. _mm_sub_epi8 matches the sibling kernels.
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+    xx_storeu_128(dst, v_res_b);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+// Same vertical-only mask sub-sampling as above, but walks each row in
+// 32-byte steps (c = 0, 32, ... while c < w) with 256-bit loads/stores.
+// NOTE(review): the inner loop always handles a full 32 bytes, so w is
+// presumably a multiple of 32 -- confirm against the caller.
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    int c;
+    for (c = 0; c < w; c += 32) {
+      const __m256i v_ra_b = yy_loadu_256(mask + c);
+      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 -
AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 
mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + 
assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + if (subx & suby) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subx) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (suby) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c new file mode 100644 index 000000000..9d6b4c2f7 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c @@ -0,0 +1,1109 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, 
uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, 
uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i 
v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + 
const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_zmask_b = 
_mm_set1_epi16(0xff);  // 0x00ff in every 16-bit lane: keeps the even bytes
+  // (_mm_set1_epi16 replaces _mm_set_epi8(0, 0xff, ...), which passed 255
+  // through `char` parameters; the resulting bit pattern is identical.)
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      // 32 mask bytes from each of two consecutive mask rows feed 16
+      // output weights.
+      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+      // Vertical sums, still one byte per mask column.
+      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+      // Even-indexed columns widened to 16 bits ...
+      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+      // ... and odd-indexed columns shifted down into the same lanes.
+      const __m128i v_rvsbl_w =
+          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
+      const __m128i v_rvsbh_w =
+          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+      // NOTE(review): xx_roundn_epu16(x, 2) is presumably a rounding right
+      // shift by 2, i.e. the average of the 2x2 mask cell -- confirm.
+      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                               const uint8_t *src0, uint32_t src0_stride,
+                               const uint8_t *src1, uint32_t src1_stride,
+                               const uint8_t *mask, uint32_t mask_stride, int w,
+                               int h, int subx, int suby) {
+  typedef void (*blend_fn)( +
uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: width_index X subx X suby + static const blend_fn blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } +} + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 
+ + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, 
+ blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t 
*src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + 
+////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int 
h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i 
v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + 
for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, + int subx, int suby, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, 
uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: bd_index X width_index X subx X suby + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, w, h, subx, + suby, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, w, h); + } +} + +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i 
*v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask 
+= mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i 
m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + 
bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, 
&v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } +} diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..064910232 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t 
src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimension: width_index + static const blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); +} + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling 
+////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + 
aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h); + } +} diff --git a/media/libaom/src/aom_dsp/x86/blend_mask_sse4.h b/media/libaom/src/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), 
res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + 
const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + 
blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + 
uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/media/libaom/src/aom_dsp/x86/blend_sse4.h b/media/libaom/src/aom_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..8d9b32510 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = 
_mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = 
xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, 
AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/media/libaom/src/aom_dsp/x86/common_avx2.h b/media/libaom/src/aom_dsp/x86/common_avx2.h new file mode 100644 index 000000000..96fe4ebb6 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/common_avx2.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
#define AOM_AOM_DSP_X86_COMMON_AVX2_H_

#include <immintrin.h>

#include "config/aom_config.h"

// Transposes a 16x16 matrix of 16-bit elements held in sixteen __m256i rows.
//
// in[r] holds row r; on return out[c] holds column c of the input. The
// transpose is built from three interleave stages (16-, 32- and 64-bit
// unpacks, each operating independently on the two 128-bit halves of a
// register) followed by a 128-bit lane permute that joins matching halves.
//
// In the lane diagrams below each entry "rc" is input row r, column c (both
// hexadecimal); one diagram line is one register, lowest element first.
//
// Note: in and out could have the same value. Every in[] element is read in
// the first stage, before any out[] element is written.
static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
  // Stage 1: interleave 16-bit elements of adjacent row pairs.
  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);

  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);

  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f

  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff

  // Stage 2: interleave 32-bit pairs, giving runs of four rows per column.
  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);

  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);

  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f

  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
  // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
  // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
  // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff

  // Stage 3: interleave 64-bit pairs, giving runs of eight rows per column.
  // The tr0_* registers are reused as the destination.
  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);

  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);

  // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
  // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
  // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
  // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
  // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
  // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f

  // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
  // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
  // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
  // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
  // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
  // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff

  // Stage 4: join 128-bit halves. Selector 0x20 takes the low half of each
  // operand (columns 0-7); 0x31 takes the high halves (columns 8-15).
  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);

  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
#endif  // AOM_AOM_DSP_X86_COMMON_AVX2_H_
#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
#define AOM_AOM_DSP_X86_CONVOLVE_H_

#include <assert.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

// Signature of the 8-bit one-dimensional filter kernels that FUN_CONV_1D
// dispatches to: source pointer and pitch, destination pointer and pitch,
// number of output rows, and the filter taps.
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
                                uint32_t output_height, const int16_t *filter);

// Defines aom_convolve8_<name>_<opt>(), a 1-D convolution entry point that
// dispatches to the aom_filter_block1d{16,8,4}_<dir>{4,8,2}_<avg><opt>
// kernels.
//
//   name      - suffix of the generated function and of the C fallback
//               aom_convolve8_<name>_c.
//   step_q4   - name of the step parameter to check (asserted == 16).
//   filter    - name of the tap array to inspect and pass on.
//   dir       - direction token pasted into the kernel name.
//   src_start - source expression passed to the 4- and 8-tap kernels; the
//               2-tap kernels receive plain `src` instead.
//               NOTE(review): presumably `src` moved back to the first tap -
//               confirm at the instantiation sites.
//   avg, opt  - tokens pasted into the kernel name.
//
// Kernel choice, by inspecting the taps:
//   - taps 0, 1, 6, 7 all zero and taps 2 | 5 non-zero -> 4-tap kernels;
//   - otherwise any of taps 0, 1, 2 non-zero           -> 8-tap kernels;
//   - otherwise                                        -> 2-tap kernels.
// The block is processed in column strips of 16, then 8, then 4; any width
// left over is handed to the C implementation. The step check is an assert
// only, so release builds do not verify it.
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
  void aom_convolve8_##name##_##opt(                                         \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
    (void)filter_x;                                                          \
    (void)x_step_q4;                                                         \
    (void)filter_y;                                                          \
    (void)y_step_q4;                                                         \
    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
    assert(step_q4 == 16);                                                   \
    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
        (filter[2] | filter[5])) {                                           \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    } else if (filter[0] | filter[1] | filter[2]) {                          \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    } else {                                                                 \
      while (w >= 16) {                                                      \
        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
                                                 dst_stride, h, filter);     \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      while (w >= 8) {                                                       \
        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
                                                dst_stride, h, filter);      \
        src += 8;                                                            \
        dst += 8;                                                            \
        w -= 8;                                                              \
      }                                                                      \
      while (w >= 4) {                                                       \
        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
                                                dst_stride, h, filter);      \
        src += 4;                                                            \
        dst += 4;                                                            \
        w -= 4;                                                              \
      }                                                                      \
    }                                                                        \
    if (w) {                                                                 \
      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
                               x_step_q4, filter_y, y_step_q4, w, h);        \
    }                                                                        \
  }

// Signature of the high-bitdepth one-dimensional filter kernels used by
// HIGH_FUN_CONV_1D; as filter8_1dfunction but on 16-bit samples and with the
// bit depth as a trailing argument.
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                       const ptrdiff_t src_pitch,
                                       uint16_t *output_ptr,
                                       ptrdiff_t out_pitch,
                                       unsigned int output_height,
                                       const int16_t *filter, int bd);

// Defines aom_highbd_convolve8_<name>_<opt>(), the high-bitdepth counterpart
// of FUN_CONV_1D. Macro parameters have the same roles.
//
// Differences visible in the expansion:
//   - src8/dst8 are converted with CONVERT_TO_SHORTPTR on entry and back with
//     CONVERT_TO_BYTEPTR for the C fallback;
//   - the SIMD path is taken only when step_q4 == 16 and filter[3] != 128,
//     checked at run time rather than asserted; otherwise the whole block
//     goes to aom_highbd_convolve8_<name>_c;
//   - there is no 4-tap path: any of taps 0, 1, 2 non-zero selects the 8-tap
//     kernels, otherwise the 2-tap kernels (which receive `src`, not
//     src_start).
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)   \
  void aom_highbd_convolve8_##name##_##opt(                                 \
      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,             \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {       \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                              \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                              \
    if (step_q4 == 16 && filter[3] != 128) {                                \
      if (filter[0] | filter[1] | filter[2]) {                              \
        while (w >= 16) {                                                   \
          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                  \
              src_start, src_stride, dst, dst_stride, h, filter, bd);       \
          src += 16;                                                        \
          dst += 16;                                                        \
          w -= 16;                                                          \
        }                                                                   \
        while (w >= 8) {                                                    \
          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                   \
              src_start, src_stride, dst, dst_stride, h, filter, bd);       \
          src += 8;                                                         \
          dst += 8;                                                         \
          w -= 8;                                                           \
        }                                                                   \
        while (w >= 4) {                                                    \
          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                   \
              src_start, src_stride, dst, dst_stride, h, filter, bd);       \
          src += 4;                                                         \
          dst += 4;                                                         \
          w -= 4;                                                           \
        }                                                                   \
      } else {                                                              \
        while (w >= 16) {                                                   \
          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                  \
              src, src_stride, dst, dst_stride, h, filter, bd);             \
          src += 16;                                                        \
          dst += 16;                                                        \
          w -= 16;                                                          \
        }                                                                   \
        while (w >= 8) {                                                    \
          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                   \
              src, src_stride, dst, dst_stride, h, filter, bd);             \
          src += 8;                                                         \
          dst += 8;                                                         \
          w -= 8;                                                           \
        }                                                                   \
        while (w >= 4) {                                                    \
          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                   \
              src, src_stride, dst, dst_stride, h, filter, bd);             \
          src += 4;                                                         \
          dst += 4;                                                         \
          w -= 4;                                                           \
        }                                                                   \
      }                                                                     \
    }                                                                       \
    if (w) {                                                                \
      aom_highbd_convolve8_##name##_c(                                      \
          CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst),     \
          dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);  \
    }                                                                       \
  }

#endif  // AOM_AOM_DSP_X86_CONVOLVE_H_
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. 
+ // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16(0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i 
res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const 
__m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h b/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 000000000..707bd2d78 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse2.h b/media/libaom/src/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 000000000..445d04b10 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i 
convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res = _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git 
a/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 000000000..6b8388d84 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + 
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/media/libaom/src/aom_dsp/x86/fft_avx2.c b/media/libaom/src/aom_dsp/x86/fft_avx2.c new file mode 100644 index 000000000..54da02253 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +extern void aom_transpose_float_sse2(const float *A, float *B, int n); +extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, + int n); + +// Generate the 1d forward transforms for float using _mm256 +GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} 
+ +void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +// Generate the 1d inverse transforms for float using _mm256 +GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, + aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); +} + +void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_avx2, aom_ifft1d_16_avx2, + aom_transpose_float_sse2, 8); +} + +void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_avx2, aom_ifft1d_32_avx2, + aom_transpose_float_sse2, 8); +} diff --git a/media/libaom/src/aom_dsp/x86/fft_sse2.c b/media/libaom/src/aom_dsp/x86/fft_sse2.c new file mode 100644 index 000000000..12bdc3e18 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/fft_sse2.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <xmmintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + 
for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + 
_mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + 
aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 000000000..1e3d13ec8 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. 
This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. 
+ for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i 
v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, 
DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + 
const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + 
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division by two of 16-bit signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6,
sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 000000000..2d8f8f71e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" + +/* Writes the sum of all 64 samples of the 8x8 block at input to output[0]; no other element of output is written. Row k is read from input + k * stride with the aligned load _mm_load_si128, so every row address must be 16-byte aligned. */ +void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + /* Accumulate the eight rows lane by lane in 16-bit arithmetic (loads of rows 4-7 are interleaved with the adds). */ + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + /* Widen the eight 16-bit column sums to 32 bits: interleaving with zero puts each sum in the upper half of a 32-bit lane, and the arithmetic shift by 16 sign-extends it. */ + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + /* Horizontal reduction: fold the eight 32-bit values down until lane 0 holds the total. */ + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +/* The shared implementation header is included twice: first with DCT_HIGH_BIT_DEPTH 0 and FDCT8x8_2D naming aom_fdct8x8_sse2, then with DCT_HIGH_BIT_DEPTH 1 and FDCT8x8_2D naming aom_highbd_fdct8x8_sse2. */ +#define DCT_HIGH_BIT_DEPTH 0 +#define FDCT8x8_2D aom_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT8x8_2D + +#undef DCT_HIGH_BIT_DEPTH +#define DCT_HIGH_BIT_DEPTH 1 +#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT8x8_2D diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h new file mode 100644 index 000000000..260d8dd58 --- /dev/null +++
b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* For each 64-bit half of a and b: multiplies the low 32-bit lanes and the high 32-bit lanes pairwise with _mm_mul_epu32 (an unsigned 32x32 -> 64 multiply) and returns the sum of the two 64-bit products. */ +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +/* Returns the low 32 bits of the two 64-bit lanes of a (result lanes 0-1) followed by the low 32 bits of the two 64-bit lanes of b (result lanes 2-3). The upper halves are dropped; no saturation is applied. */ +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +/* Returns nonzero when any 16-bit lane of *preg0 or *preg1 equals 0x7fff or 0x8000, i.e. sits at either end of the int16 range; returns 0 otherwise. A lane that legitimately holds one of those two values is reported as well. The result is a _mm_movemask_epi8 byte mask, so callers should only test it against zero. */ +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +/* Same test as check_epi16_overflow_x2, applied to four registers: nonzero when any 16-bit lane of *preg0..*preg3 equals 0x7fff or 0x8000. This is the building block for the wider x8/x12/x16/x32 variants. */ +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 =
_mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +/* Eight-register variant: both groups of four are always checked. The return value is the sum of the two (non-negative) x4 results, so it is nonzero exactly when either group reported a lane at a limit; it is not itself a bit mask. */ +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +/* Twelve-register variant. Registers 8-11 are only examined when registers 0-3 were clean; skipping them cannot hide an overflow because the result is already nonzero in that case. */ +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +/* Sixteen-register variant. Later groups of four are only examined while the accumulator they would overwrite (res0 or res1) is still zero, so checking stops early once a limit value has been seen; the result is nonzero whenever any examined or skipped-after-hit group contains one. */ +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +/* Thirty-two-register variant. Groups of four alternate between the res0 and res1 accumulators; each further group is examined only while every earlier check on the path to it returned zero, so the nesting is purely an early-out. Returns nonzero when a lane at 0x7fff or 0x8000 was found. */ +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +/* Stores the eight 16-bit lanes of *poutput to dst_ptr[0..7] using aligned stores. When tran_low_t is 4 bytes wide each lane is first sign-extended to 32 bits (interleaved with its all-ones/all-zeros sign mask) and written as two 16-byte stores; otherwise the register is stored unchanged with a single 16-byte store. */ +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); +
__m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_store_si128((__m128i *)(dst_ptr), *poutput); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..c1fb259a1 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,379 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +; TRANSFORM_COEFFS a, b emits two 8-word constant rows: pw_<a>_<b> = (a, b) repeated four times and pw_<b>_m<a> = (b, -a) repeated four times. Both are used below as pmaddwd operands on interleaved word pairs. +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +; STORE_OUTPUT index, result: sign-extends the eight words of m<result> to dwords and stores them as two 16-byte aligned writes starting at outputq + 4*index bytes. Clobbers m11, m12 and m<result>. +%macro STORE_OUTPUT 2 ; index, result + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%endmacro + +SECTION .text + +%if ARCH_X86_64 +INIT_XMM ssse3 +; fdct8x8(input, output, stride): 8x8 forward transform done as column transform, transpose, row transform, transpose, then a final divide by two; the 64 results are written to output as 32-bit values by STORE_OUTPUT. Declared with 3 arguments, 5 general-purpose registers and 13 XMM registers (m0-m12). +cglobal fdct8x8, 3, 5, 13, input, output, stride + + ; m8 = rounding constant 8192 added before each arithmetic shift right by 14; m12 = 23170 (2 * 11585) used with pmulhrsw + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] + + ; r3 = 2 * stride and r4 = 4 * stride; registers are loaded in pairs from inputq and inputq + r3, then inputq advances by r4 + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd
m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 +
psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq
m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + ; final scaling, four registers at a time: n / 2 computed as (n - (n >> 15)) >> 1. + ; m1, m6, m8 and m11 are scratch here; m8 no longer holds the rounding constant from this point on. + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + ; write the eight result registers; the first argument is the offset in 32-bit output elements (8 per register) + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 + + RET +%endif diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 000000000..099fcf7fc --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,998 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#include <immintrin.h> +#include <string.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, 
p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * 
src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void 
av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else if (w == 4) { + 
_mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } + } +} + +#define CONV8_ROUNDING_BITS (7) + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = 
_mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + 
__m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + 
filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = 
_mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + 
x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 
= _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, 
uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = 
_mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void 
aom_highbd_filter_block1d16_v8_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[17], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  /* 8-tap vertical filter over a 16-pixel-wide column of 16-bit samples.
+   * Every loop iteration produces two output rows; store_16x2_pixels()
+   * limits them to `max`, i.e. (1 << bd) - 1.  signal[16] carries the last
+   * loaded source row from one iteration to the next. */
+  __m256i ff[4];
+  pack_filters(filter, ff);
+  /* Prime the window with source rows 0..6; rows 7 and 8 are fetched by
+   * pack_16x9_pixels() inside the loop. */
+  pack_16x9_init(src_ptr, src_pitch, signal);
+
+  do {
+    pack_16x9_pixels(src_ptr, src_pitch, signal);
+    filter_16x9_pixels(signal, ff, &res0, &res1);
+    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    update_16x9_pixels(signal);
+    /* Step past the two rows just written.
+     * NOTE(review): height is unsigned and drops by 2 per pass, so an odd or
+     * zero height would wrap around instead of ending the loop -- presumably
+     * callers only pass even, non-zero heights; confirm. */
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+    height -= 2;
+  } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+/* Seeds the 2-tap vertical pipeline: loads 16 pixels of the first row into
+ * sig[2], the slot pack_16x2_2t_pixels() reads as "previous row". */
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+  sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+/* Loads the row at src + pitch and interleaves it, 16 bits at a time, with
+ * the previous row kept in sig[2]: sig[0] receives the unpacklo result and
+ * sig[1] the unpackhi result.  The new row then replaces sig[2] so the next
+ * call pairs it with the row after it. */
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+                                       __m256i *sig) {
+  // load the next row
+  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+  sig[2] = u;
+}
+/* Thin wrapper: forwards unchanged to filter_16_2t_pixels(). */
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+                                         __m256i *y0, __m256i *y1) {
+  filter_16_2t_pixels(sig, f, y0, y1);
+}
+/* 2-tap vertical filter, 16 pixels wide.  Each loop iteration combines the
+ * current row with the row below it and writes one output row, limited to
+ * (1 << bd) - 1 by store_16x1_pixels(). */
+static void aom_highbd_filter_block1d16_v2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[3], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  __m256i ff;
+  /* pack_2t_filter() broadcasts bytes 6..9 of the tap array, i.e. the int16
+   * taps filter[3] and filter[4]; signal[2] starts out as row 0. */
+  pack_2t_filter(filter, &ff);
+  pack_16x2_init(src_ptr, signal);
+
+  do {
+    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+    /* NOTE(review): do/while on an unsigned counter -- height == 0 would
+     * wrap; presumably callers pass height >= 1, confirm. */
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+
+static INLINE void 
pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+  const __m128i h = _mm_loadu_si128((const __m128i *)filter);  // 8 int16 taps
+  const __m128i p = _mm_set1_epi32(0x09080706);  // byte indices 6..9
+  f[0] = _mm_shuffle_epi8(h, p);  // filter[3], filter[4] in every 32-bit lane
+}
+/* Seeds the 8-wide 2-tap vertical pipeline: loads 8 pixels of the first row
+ * into sig[2], the slot pack_8x2_2t_pixels_ver() reads as "previous row". */
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+  sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+/* Loads the row at src + pitch and interleaves it, 16 bits at a time, with
+ * the previous row kept in sig[2]: sig[0] gets the unpacklo result (first
+ * four columns), sig[1] the unpackhi result (last four columns).  The new
+ * row then replaces sig[2]. */
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+                                          __m128i *sig) {
+  // load the next row
+  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+  sig[0] = _mm_unpacklo_epi16(sig[2], u);
+  sig[1] = _mm_unpackhi_epi16(sig[2], u);
+  sig[2] = u;
+}
+/* Applies the 2-tap filter to both halves of the row pair: multiply-add of
+ * the interleaved samples in sig[0] / sig[1] with the tap pair in *f, plus
+ * the rounding term 1 << (CONV8_ROUNDING_BITS - 1), followed by an
+ * arithmetic right shift by CONV8_ROUNDING_BITS.  Results are 32-bit lanes
+ * written to *y0 and *y1. */
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+                                      __m128i *y0, __m128i *y1) {
+  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+  __m128i x0 = _mm_madd_epi16(sig[0], *f);
+  __m128i x1 = _mm_madd_epi16(sig[1], *f);
+  x0 = _mm_add_epi32(x0, rounding);
+  x1 = _mm_add_epi32(x1, rounding);
+  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+/* Narrows the two 32-bit result vectors to eight 16-bit pixels with unsigned
+ * saturation, limits them to *mask with a 16-bit min, and stores them to dst
+ * with an unaligned 128-bit store. */
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+                                           const __m128i *mask, uint16_t *dst) {
+  __m128i res = _mm_packus_epi32(*y0, *y1);
+  res = _mm_min_epi16(res, *mask);
+  _mm_storeu_si128((__m128i *)dst, res);
+}
+/* 2-tap vertical filter, 8 pixels wide, using 128-bit registers.  Each loop
+ * iteration combines the current row with the row below it and writes one
+ * output row limited to (1 << bd) - 1. */
+static void aom_highbd_filter_block1d8_v2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m128i signal[3], res0, res1;
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  __m128i ff;
+
+  pack_8x1_2t_filter(filter, &ff);
+  pack_8x2_init(src_ptr, signal);
+
+  do {
+    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+    filter_8_2t_pixels(signal, &ff, &res0, &res1);
+    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+    /* NOTE(review): do/while on an unsigned counter -- height == 0 would
+     * wrap; presumably callers pass height >= 1, confirm. */
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+/* The 4-pixel-wide kernels are only declared here; their definitions are not
+ * in this file.  The #define lines below make the *_avx2 names for width 4
+ * resolve to these *_sse2 functions. */
+void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+                                        ptrdiff_t, uint32_t, const int16_t *,
+                                        int);
+void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+                                        ptrdiff_t, uint32_t, const int16_t *,
+                                        int);
+void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+                                        ptrdiff_t, uint32_t, const int16_t *,
+                                        int);
+void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+                                        ptrdiff_t, uint32_t, const int16_t *,
+                                        int);
+#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
+#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
+#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
+#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+/* HIGH_FUN_CONV_1D is defined outside this file (presumably in
+ * aom_dsp/x86/convolve.h -- confirm).  The vertical instantiation passes a
+ * source pointer moved up by three rows. */
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+#undef HIGHBD_FUNC
diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 000000000..e7b33d1c4
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m128i s[16], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, 
round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 
8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m128i s[4], coeffs_x[4]; + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..5a55736c4 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,984 @@ +/* + * 
Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const 
__m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, 
stride, above, left, bd); + dst += stride << 3; + left += 8; + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) 
{ + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + 
h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = 
_mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const 
__m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = 
_mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + 
dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, 
sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + 
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void 
aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), 
above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + 
_mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = 
_mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + +void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + 
__m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = 
dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm new file mode 100644 index 000000000..91b3d126c --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova 
[dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + 
mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +; 16x16 vertical: copies the 16 words at aboveq into each of 16 rows (the loop +; runs 4 times and stores 4 rows per iteration). +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +; 32x32 vertical: copies the 32 words at aboveq into each of 32 rows (the loop +; runs 8 times and stores 4 rows per iteration). +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET diff --git a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..c954da94e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +/* Each *_avx2 entry point below forwards its arguments, unchanged and in the same order, to the *_sse2 function of the same name. */ +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); +} + +/* Forwards to aom_highbd_lpf_vertical_14_dual_sse2. */ +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +/* Forwards to aom_highbd_lpf_horizontal_4_dual_sse2. */ +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +/* Forwards to aom_highbd_lpf_horizontal_8_dual_sse2. */ +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +/* Forwards to aom_highbd_lpf_vertical_4_dual_sse2. */ +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t 
*blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +/* AVX2 entry point: forwards every argument unchanged to aom_highbd_lpf_vertical_8_dual_sse2. */ +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 000000000..097e0778f --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1697 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/lpf_common_sse2.h" + +/* Clamps every signed 16-bit lane of *pixel to the range [*min, *max] in place. */ +static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + *pixel = _mm_min_epi16(*pixel, *max); + *pixel = _mm_max_epi16(*pixel, *min); +} + +/* Per-lane |a - b| for unsigned 16-bit lanes: one of the two saturating differences is zero, so OR-ing them yields the absolute difference. */ +static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); +} + +/* Loads 16 bytes from each of bl, l and t with aligned loads, zero-extends the low 8 bytes to 16-bit lanes and shifts them left by (bd - 8) into *blt, *lt and *thr. *t80_out receives 1 << (bd - 1) in every lane. */ +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr, __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +/* Two-edge variant of get_limit: the low 64 bits of each output come from the *0 arrays and the high 64 bits from the *1 arrays. */ +static INLINE void get_limit_dual( + const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, + const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, + int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, + __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); + __m128i x1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *blt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *lt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i 
*)_thresh1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *thr_out = _mm_slli_epi16(x0, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +/* Loads `size` rows on each side of s with unaligned 128-bit loads: p[i] is the row at s - (i + 1) * pitch and q[i] is the row at s + i * pitch. */ +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); + } +} + +/* Sets a lane of *mask to all-ones when 2 * |p0 - q0| + |p1 - q1| / 2 does not exceed *bl and every |p[i] - p[i - 1]| and |q[i] - q[i - 1]| for i = 1..3 does not exceed *l; the lane is zero otherwise. */ +static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = abs_diff16(p[0], q[0]); + __m128i abs_p1q1 = abs_diff16(p[1], q[1]); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} + +/* Packed variant working on pq[] vectors (callers in this file build pq[i] with p[i] in the low 64 bits and q[i] in the high 64 bits). Outputs *p1p0 / *q1q0 (pq[0] and pq[1] low / high halves), *abs_p1p0 = |pq[0] - pq[1]|, *hev (all-ones where max(|p1 - p0|, |q1 - q0|) exceeds *t, replicated to both halves) and *mask. The p and q sides are folded together with a 64-bit shift, so the combined mask is in the low 64 bits. x is the number of pq[] entries examined. */ +static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, + __m128i *p1p0, __m128i *q1q0, + __m128i *abs_p1p0, __m128i *l, + __m128i *bl, __m128i *t, + __m128i *hev, __m128i *mask) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; + __m128i max, max01, h; + + *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); + *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); + + abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); 
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 + + max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + *abs_p1p0 = abs_diff16(pq[0], pq[1]); + abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); + max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); + // mask |= (abs(*p1 - *p0) > limit) * -1; + // mask |= (abs(*q1 - *q0) > limit) * -1; + h = _mm_subs_epu16(max01, *t); + + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + // replicate for the further "merged variables" usage + *hev = _mm_unpacklo_epi64(*hev, *hev); + + max = _mm_max_epi16(max, max01); + int i; + for (i = 2; i < x; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // ~mask +} + +/* Takes the maximum of |pq[i] - pq[0]| for i in [start, end), folds the high 64 bits into the low 64 bits, and sets *flat lanes to all-ones where that maximum does not exceed *th. */ +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, + int start, int end, __m128i *flat) { + int i; + __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), + abs_diff16(pq[start + 1], pq[0])); + + for (i = start + 2; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +/* Same test with separate p[] and q[] arrays: *flat lanes are all-ones where |p[i] - p[0]| and |q[i] - q[0]| for i in [start, end) all stay within *th. */ +static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, + const __m128i *q, int start, int end, + __m128i *flat) { + int i; + __m128i max = + _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); + + for (i = start + 1; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); + } + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = 
_mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +/* Computes *flat from pq[1..3] and *flat2 from pq[4..6], each compared against pq[0] with a threshold of 1 << (bd - 8). */ +static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal(&th, pq, 1, 4, flat); + flat_mask_internal(&th, pq, 4, 7, flat2); +} + +/* Separate p[] / q[] variant: *flat covers indices 1..3 and *flat2 covers indices 4..6, with the same 1 << (bd - 8) threshold. */ +static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p, + const __m128i *q, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal_dual(&th, p, q, 1, 4, flat); + flat_mask_internal_dual(&th, p, q, 4, 7, flat2); +} + +/* Narrow filter on packed operands: *p1p0 and *q1q0 are biased by subtracting *t80, adjusted by a correction gated with *hev and *mask, clamped to [-t80, (1 << bd) - 1 - t80] and written with *t80 added back to *ps1ps0 and *qs1qs0. */ +static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0, __m128i *t80, + int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + + const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); + __m128i ps1ps0_work, qs1qs0_work, work; + __m128i filt, filter2filter1, filter2filt, filter1filt; + + ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); + qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); + + work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); + pixel_clamp(&pmin, &pmax, &work); + filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, *mask); + filt = _mm_unpacklo_epi64(filt, filt); + + filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ + pixel_clamp(&pmin, &pmax, &filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ + + filt = 
_mm_unpacklo_epi64(filter2filter1, filter2filter1); + + // filt >> 1 + filt = _mm_adds_epi16(filt, one); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(*hev, filt); + + filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); + filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); + + qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); + ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); + + pixel_clamp(&pmin, &pmax, &qs1qs0_work); + pixel_clamp(&pmin, &pmax, &ps1ps0_work); + + *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); + *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); +} + +/* Narrow filter with separate p[] / q[] vectors: reads p[0..1] and q[0..1], derives the hev mask from *th inside the function, and writes the filtered rows to ps[0..1] and qs[0..1]. Intermediate values are biased by *t80 and clamped to [-t80, (1 << bd) - 1 - t80]. */ +static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, + __m128i *qs, const __m128i *mask, + const __m128i *th, int bd, + __m128i *t80) { + __m128i ps0 = _mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + // hev_filter + __m128i hev; + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + + filter = _mm_and_si128(filter, hev); + + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i 
filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); +} + +/* 14-tap filter core for one 4-pixel edge. The low 64 bits of p[i] and q[i] (i = 0..6) are packed into pq[i]; the narrow filter result is then replaced by the flat outputs in pq[0..2] where flat is set and by the wide outputs in pq[0..5] where flat2 is set. Results are returned in pq[]. */ +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); + + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); + + __m128i flat, flat2; + highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) 
{ + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + /* pack the p-side sum (low 64 bits) with the q-side sum (high 64 bits) and shift right by 3; the rounding term 4 is already part of sum_lp */ + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, 
_mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + /* wide outputs: pack the p-side sum (low 64 bits) with the q-side sum (high 64 bits) and shift right by 4; the rounding term 8 is already part of sum_p_0 */ + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + 
// ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + } +} + +/* Filters one 4-pixel-wide edge: loads 64 bits from each of the 7 rows on either side of s, runs highbd_lpf_internal_14_sse2, and writes back the 6 nearest rows on each side (low 64 bits of pq[i] to the p row, high 64 bits to the q row). */ +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); + } + + highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +/* 14-tap filter core for two edges at once: p[] and q[] hold full 8-lane rows and are updated in place (p[0..5] / q[0..5] can change). The limits for the two edges are combined by get_limit_dual. */ +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, 
&mask, &thresh, bd, &t80); + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + /* at this point sum_p_0 = p0..p5 + q0..q5 + 8 and sum_lp = p0..p2 + q0..q2 + 4 */ + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, 
zero))); + if (flat2_mask) { + /* wide outputs: each sum already carries the rounding term 8 (via sum_p_0) and is shifted right by 4 */ + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + 
sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + // Blend once: keep the filter4 result (ps/qs) where flat is clear and take + // the flat result where it is set. Repeating this and-not/and/or sequence + // with the same mask would reproduce the same values, so it runs one time. + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; + } +} + +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, 
const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[3]; + __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; + __m128i flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], 
pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, + const unsigned char *_thresh0, const unsigned char *_blimit1, + const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat, work; + __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; + 
__m128i op1, op0, oq0, oq1; + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p2p1 = abs_diff16(*p2, *p1); + abs_p1p0 = abs_diff16(*p1, *p0); + abs_q1q0 = abs_diff16(*q1, *q0); + abs_q2q1 = abs_diff16(*q2, *q1); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + mask = _mm_max_epi16(abs_q2q1, mask); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + mask = _mm_max_epi16(work, mask); + mask = _mm_max_epi16(mask, abs_p2p1); + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + // flat_mask + flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); + flat = _mm_max_epi16(flat, work); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + 
workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + op1 = _mm_srli_epi16(workp_shft0, 3); + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + op0 = _mm_srli_epi16(workp_a, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + oq0 = _mm_srli_epi16(workp_shft0, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + oq1 = _mm_srli_epi16(workp_shft1, 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + 
highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, + _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); +} + +void aom_highbd_lpf_horizontal_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[4]; + __m128i p1p0, q1q0, ps1ps0, qs1qs0; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + pq[3] = _mm_unpacklo_epi64(*p3, *q3); + + __m128i abs_p1p0; + + const __m128i four = _mm_set1_epi16(4); + 
__m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask4 + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_q0q1 = 
_mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, + const unsigned char *_limit0, const unsigned char *_thresh0, + const unsigned char *_blimit1, const unsigned char *_limit1, + const unsigned char *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat; + __m128i work_a, op2, oq2, op1, op0, oq0, oq1; + __m128i abs_p1q1, abs_p0q0, work0, work1, work2; + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; + + // So taking maximums continues to work: + mask = 
_mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); + work1 = + _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat + work0 = _mm_max_epi16(work0, work1); + work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); + work2 = _mm_max_epi16(work2, work0); + mask = _mm_max_epi16(work2, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), 
_mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + 
highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + + __m128i t80; + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + mask = _mm_max_epi16(flat, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 
= _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); + + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + 
_mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * 
p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * 
p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; + + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + // Loop filtering + highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 
6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} + +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, 
&d1, &d2, &d3); + + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + 
p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 000000000..b9689202a --- /dev/null +++ 
b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, __m256i *qp) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. 
The output, 16 int32_t is save in *p. +static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y, + __m256i *p) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, 16); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_or_si256(prod_lo, prod_hi); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs = _mm256_abs_epi32(*c); + const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]); + __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); + flag2 = _mm256_or_si256(flag1, flag2); + const int32_t nzflag = _mm256_movemask_epi8(flag2); + + if (LIKELY(nzflag)) { + __m256i q = _mm256_add_epi32(abs, qp[1]); + __m256i tmp; + mm256_mul_shift_epi32(&q, &qp[2], &tmp); + q = _mm256_add_epi32(tmp, q); + + mm256_mul_shift_epi32(&q, &qp[4], &q); + __m256i dq = _mm256_mullo_epi32(q, qp[3]); + + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + q = _mm256_and_si256(q, flag2); + dq = _mm256_and_si256(dq, flag2); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + 
cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); + } else { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + } +} + +void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const unsigned int step = 8; + + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 000000000..58e5f98e5 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,148 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + __m128i zbins[2]; + __m128i nzbins[2]; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + (void)scan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < 
non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..e0d22522d --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Register roles shared by all HIGH_PROCESS_* macros below:
;   m0      source pixels (16-bit words)
;   m1      all words == 1 (set up by HIGH_SADNXN4D); pmaddwd with it adds
;           adjacent word pairs into dwords
;   m2, m3  scratch
;   m4..m7  dword SAD accumulators for ref1..ref4
; |src - ref| is formed as (src -sat ref) | (ref -sat src) using unsigned
; saturating subtraction: one of the two terms is always zero.
; All offsets are in pixels and are scaled by 2 to get byte offsets.

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;   %1  1 = initialise the accumulators, 0 = add to them
;   %2  src offset of the first row     %3  ref offset of the first row
;   %4  src offset of the second row    %5  ref offset of the second row
;   %6  (optional, default 0) 1 = advance src/ref pointers by two rows
; The first row goes into the low half of each register and the second row
; into the high half (movhps), so two 4-pixel rows are handled at once.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  movu                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  movu                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  movu                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  movu                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; stride is in pixels: 2 rows * 2 bytes per pixel = stride * 4 bytes
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Same parameters as HIGH_PROCESS_4x2x4. Each of the two 8-pixel groups fills
; a whole register; src is read with aligned loads (mova), refs with
; unaligned loads (movu).
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  ; the pointer advance sits between the last loads and the final accumulate
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two rows of 16 pixels: one HIGH_PROCESS_8x2x4 call per row, each covering
; the row's two 8-pixel halves. Only the second call may advance the pointers.
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Built from HIGH_PROCESS_16x2x4 in the same way, splitting each row at +16.
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Built from HIGH_PROCESS_32x2x4 in the same way, splitting each row at +32.
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
; where NxN is any of the block sizes instantiated at the bottom of this file.
;
; HIGH_SADNXN4D width, height
; Computes the SAD of one source block against four reference blocks and
; stores the four sums in res[0..3]. Rows are consumed two at a time: one
; initialising call, (height-4)/2 accumulating calls, and a last call that
; does not advance the pointers.
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif

; set m1
; (src is borrowed as a scratch GPR, hence the push/pop; after the pshufd
; every word of m1 is 1)
  push                srcq
  mov                 srcd, 0x00010001
  movd                  m1, srcd
  pshufd                m1, m1, 0x0
  pop                 srcq

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  ; ref1 initially points at the array of four reference pointers; it is
  ; overwritten last because it is the base of the other three loads
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  ; fold the high half of each accumulator onto its low half (4 -> 2 dwords)
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  ; interleave pairs of accumulators and fold again so that m4 ends up as
  ; [sad(ref1), sad(ref2), sad(ref3), sad(ref4)]
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  punpcklqdq            m4, m6
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
HIGH_SADNXN4D  4, 16
HIGH_SADNXN4D 16,  4
HIGH_SADNXN4D  8, 32
HIGH_SADNXN4D 32,  8
HIGH_SADNXN4D 16, 64
HIGH_SADNXN4D 64, 16
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_SAD_FN width, height, num_gprs, avg
; Common prologue of every SAD function in this file.
;   %1, %2  block width and height (used for the function name)
;   %3      5 or 7: the 7-register form also sets up src_stride3/ref_stride3
;           (3 * stride)
;   %4      0 = plain SAD, 1 = "_avg" variant taking an extra second_pred
;           argument
; In the 7-register avg form n_rows is not a named register: it is r7d on
; x86-64 and the stack slot of argument 0 (r0m) on x86-32.
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; Register roles shared by the HIGH_SAD*XN loops below:
;   m0      dword SAD accumulator
;   m1..m4  reference pixels, then |src - ref| (16-bit words)
;   m5      scratch copy of the source pixels
;   m6      zero, used to widen words to dwords (punpcklwd)
; |src - ref| is formed as (src -sat ref) | (ref -sat src).
; src is read with aligned accesses (mova / memory operands), ref with movu.
; In the avg variants each reference register is first replaced by the
; rounded average (pavgw) of ref and second_pred, and second_pred advances
; by 4 registers (mmsize*4 bytes) per group of four loads.

; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
; HIGH_SAD64XN height [, avg]   -- one 64-pixel row per loop iteration
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; add the four difference registers pairwise as words, fold the high half
  ; onto the low half, then widen to dwords before accumulating
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance one row: stride is in pixels, 2 bytes per pixel
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dwords in m0; the total is returned in eax
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2

; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
; HIGH_SAD32XN height [, avg]   -- one 32-pixel row per loop iteration
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance one row
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dwords in m0; the total is returned in eax
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2

; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
; HIGH_SAD16XN height [, avg]   -- two 16-pixel rows per loop iteration
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+16]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*2+16]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance two rows
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dwords in m0; the total is returned in eax
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2

; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
; HIGH_SAD8XN height [, avg]   -- four 8-pixel rows per loop iteration
; (rows 0..3 are addressed with stride*{0,1,2,3}, hence the 7-register form
; of HIGH_SAD_FN, which provides src_stride3/ref_stride3)
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance four rows
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dwords in m0; the total is returned in eax
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2

; ---------------------------------------------------------------------------
; The lines below are the start of the next file in this dump,
; media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
; ---------------------------------------------------------------------------
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
; Pairs of 8-word coefficient rows; each pair sums to 16.
; NOTE(review): presumably one pair per sub-pixel offset 0..7 -- confirm
; against SUBPEL_VARIANCE.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
+
+; SUM_SSE src1, dst1, src2, dst2, sum, sse
+; Forms the word differences (src1 - dst1) and (src2 - dst2), then
+;   - adds the pairwise-summed squares of both differences to the dword
+;     accumulator %6 (sse), and
+;   - folds both differences into four words, sign-extends them to dwords
+;     and adds them to the dword accumulator %5 (sum).
+; All four input registers (%1-%4) are overwritten.
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw       %3, %4
+  psubw       %1, %2
+  mova        %4, %3       ; make copies to manipulate to calc sum
+  mova        %2, %1       ; use originals for calc sse
+  pmaddwd     %3, %3
+  paddw       %4, %2
+  pmaddwd     %1, %1
+  movhlps     %2, %4
+  paddd       %6, %3
+  paddw       %4, %2
+  pxor        %2, %2
+  pcmpgtw     %2, %4       ; mask for 0 > %4 (sum)
+  punpcklwd   %4, %2       ; sign-extend word to dword
+  paddd       %6, %1
+  paddd       %5, %4
+
+%endmacro
+
+; STORE_AND_RET
+; Horizontally adds the four dwords of m7 (sse) and of m6 (sum), stores the
+; sse total through the pointer read from ssem and returns the sum in eax.
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+  movhlps     m3, m7
+  movhlps     m4, m6
+  paddd       m7, m3
+  paddd       m6, m4
+  pshufd      m3, m7, 0x1
+  pshufd      m4, m6, 0x1
+  paddd       m7, m3
+  paddd       m6, m4
+  mov         r1, ssem         ; r1 = unsigned int *sse
+  movd        [r1], m7         ; store sse
+  movd        eax, m6          ; store sum as return value
+%endif
+  RET
+%endmacro
+
+; INC_SRC_BY_SRC_STRIDE
+; Advances srcq by src_stride*2 bytes. On x86-32 PIC builds the stride is
+; added twice from its stack slot (src_stridemp) because the both-offsets-
+; bilinear path below reuses the src_stride register as a temporary.
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  add         srcq, src_stridemp
+  add         srcq, src_stridemp
+%else
+  lea         srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+; SUBPEL_VARIANCE W [, avg]
+; Emits highbd_sub_pixel_variance<W>xh, or highbd_sub_pixel_avg_variance<W>xh
+; when the second argument is 1 (the prediction is then additionally
+; averaged with the block at sec before the comparison with dst).
+; The body dispatches on x_offset and y_offset independently:
+;   offset == 0 : no filtering in that direction
+;   offset == 8 : pavgw of the two neighbouring samples/rows
+;   otherwise   : two-tap filter whose coefficients are read from
+;                 bilin_filter_m at (offset << filter_idx_shift), rounded
+;                 with pw_8 and shifted right by 4
+; giving nine loops. Every loop accumulates with SUM_SSE into m6 (sum) and
+; m7 (sse) and finishes with STORE_AND_RET.
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if ARCH_X86_64
+  %if %2 == 1 ; avg
+    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, height, sse
+  %endif
+  %define block_height heightd
+  %define bilin_filter sseq
+%else
+  %if CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse, \
+                                        g_bilin_filter, g_pw_8
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse, \
+                                    g_bilin_filter, g_pw_8
+      %define block_height heightd
+
+      ; Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse
+      %define block_height heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT      %1 <= 16         ; m6 overflows if w > 16
+  pxor        m6, m6           ; sum
+  pxor        m7, m7           ; sse
+
+  ; The W < 16 loops below consume two rows per iteration, so the row
+  ; counter is halved for them.
+%if %1 < 16
+  sar         block_height, 1
+%endif
+  ; Doubled like the src/dst strides, which are applied as stride*2.
+%if %2 == 1 ; avg
+  shl         sec_str, 1
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test        x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test        y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m2, [srcq + 16]
+  mova        m1, [dstq]
+  mova        m3, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m2, [secq+16]
+%endif
+  SUM_SSE     m0, m1, m2, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m2, [srcq + src_strideq*2]
+  mova        m1, [dstq]
+  mova        m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m2, [secq]
+%endif
+  SUM_SSE     m0, m1, m2, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp         y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+16]
+  movu        m4, [srcq+src_strideq*2]
+  movu        m5, [srcq+src_strideq*2+16]
+  mova        m2, [dstq]
+  mova        m3, [dstq+16]
+  pavgw       m0, m4
+  pavgw       m1, m5
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+src_strideq*2]
+  movu        m5, [srcq+src_strideq*4]
+  mova        m2, [dstq]
+  mova        m3, [dstq+dst_strideq*2]
+  pavgw       m0, m1
+  pavgw       m1, m5
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m1, [secq]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%if ARCH_X86_64
+  lea         bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+  shl         y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova        m8, [bilin_filter+y_offsetq]
+  mova        m9, [bilin_filter+y_offsetq+16]
+  mova        m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add         y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov         tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add         y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq + 16]
+  movu        m4, [srcq+src_strideq*2]
+  movu        m5, [srcq+src_strideq*2+16]
+  mova        m2, [dstq]
+  mova        m3, [dstq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
+  pmullw      m1, filter_y_a
+  pmullw      m5, filter_y_b
+  paddw       m1, filter_rnd
+  pmullw      m0, filter_y_a
+  pmullw      m4, filter_y_b
+  paddw       m0, filter_rnd
+  paddw       m1, m5
+  paddw       m0, m4
+  psrlw       m1, 4
+  psrlw       m0, 4
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+src_strideq*2]
+  movu        m5, [srcq+src_strideq*4]
+  mova        m4, m1
+  mova        m2, [dstq]
+  mova        m3, [dstq+dst_strideq*2]
+  pmullw      m1, filter_y_a
+  pmullw      m5, filter_y_b
+  paddw       m1, filter_rnd
+  pmullw      m0, filter_y_a
+  pmullw      m4, filter_y_b
+  paddw       m0, filter_rnd
+  paddw       m1, m5
+  paddw       m0, m4
+  psrlw       m1, 4
+  psrlw       m0, 4
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m1, [secq]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp         x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test        y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq + 16]
+  movu        m4, [srcq + 2]
+  movu        m5, [srcq + 18]
+  mova        m2, [dstq]
+  mova        m3, [dstq + 16]
+  pavgw       m0, m4
+  pavgw       m1, m5
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m1, [srcq + src_strideq*2]
+  movu        m4, [srcq + 2]
+  movu        m5, [srcq + src_strideq*2 + 2]
+  mova        m2, [dstq]
+  mova        m3, [dstq + dst_strideq*2]
+  pavgw       m0, m4
+  pavgw       m1, m5
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m1, [secq]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp         y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+  ; The first row is averaged horizontally before the loop; each iteration
+  ; then carries its last horizontally averaged row over to the next one.
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+16]
+  movu        m2, [srcq+2]
+  movu        m3, [srcq+18]
+  lea         srcq, [srcq + src_strideq*2]
+  pavgw       m0, m2
+  pavgw       m1, m3
+.x_half_y_half_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq + 16]
+  movu        m4, [srcq + 2]
+  movu        m5, [srcq + 18]
+  pavgw       m2, m4
+  pavgw       m3, m5
+  pavgw       m0, m2
+  pavgw       m1, m3
+  mova        m4, [dstq]
+  mova        m5, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m4, m1, m5, m6, m7
+  mova        m0, m2
+  mova        m1, m3
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m2, [srcq+2]
+  lea         srcq, [srcq + src_strideq*2]
+  pavgw       m0, m2
+.x_half_y_half_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq + src_strideq*2]
+  movu        m4, [srcq + 2]
+  movu        m5, [srcq + src_strideq*2 + 2]
+  pavgw       m2, m4
+  pavgw       m3, m5
+  pavgw       m0, m2
+  pavgw       m2, m3
+  mova        m4, [dstq]
+  mova        m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m2, [secq]
+%endif
+  SUM_SSE     m0, m4, m2, m5, m6, m7
+  mova        m0, m3
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%if ARCH_X86_64
+  lea         bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+  shl         y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova        m8, [bilin_filter+y_offsetq]
+  mova        m9, [bilin_filter+y_offsetq+16]
+  mova        m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add         y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov         tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add         y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+16]
+  movu        m2, [srcq+2]
+  movu        m3, [srcq+18]
+  lea         srcq, [srcq + src_strideq*2]
+  pavgw       m0, m2
+  pavgw       m1, m3
+.x_half_y_other_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq+16]
+  movu        m4, [srcq+2]
+  movu        m5, [srcq+18]
+  pavgw       m2, m4
+  pavgw       m3, m5
+  mova        m4, m2
+  mova        m5, m3
+  pmullw      m1, filter_y_a
+  pmullw      m3, filter_y_b
+  paddw       m1, filter_rnd
+  paddw       m1, m3
+  pmullw      m0, filter_y_a
+  pmullw      m2, filter_y_b
+  paddw       m0, filter_rnd
+  psrlw       m1, 4
+  paddw       m0, m2
+  mova        m2, [dstq]
+  psrlw       m0, 4
+  mova        m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+  mova        m0, m4
+  mova        m1, m5
+
+  lea         srcq, [srcq + src_strideq*2]
+  lea         dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m2, [srcq+2]
+  lea         srcq, [srcq + src_strideq*2]
+  pavgw       m0, m2
+.x_half_y_other_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq+src_strideq*2]
+  movu        m4, [srcq+2]
+  movu        m5, [srcq+src_strideq*2+2]
+  pavgw       m2, m4
+  pavgw       m3, m5
+  mova        m4, m2
+  mova        m5, m3
+  pmullw      m4, filter_y_a
+  pmullw      m3, filter_y_b
+  paddw       m4, filter_rnd
+  paddw       m4, m3
+  pmullw      m0, filter_y_a
+  pmullw      m2, filter_y_b
+  paddw       m0, filter_rnd
+  psrlw       m4, 4
+  paddw       m0, m2
+  mova        m2, [dstq]
+  psrlw       m0, 4
+  mova        m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m4, [secq]
+%endif
+  SUM_SSE     m0, m2, m4, m3, m6, m7
+  mova        m0, m5
+
+  lea         srcq, [srcq + src_strideq*4]
+  lea         dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test        y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%if ARCH_X86_64
+  lea         bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+  shl         x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova        m8, [bilin_filter+x_offsetq]
+  mova        m9, [bilin_filter+x_offsetq+16]
+  mova        m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add         x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov         tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add         x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+16]
+  movu        m2, [srcq+2]
+  movu        m3, [srcq+18]
+  mova        m4, [dstq]
+  mova        m5, [dstq+16]
+  pmullw      m1, filter_x_a
+  pmullw      m3, filter_x_b
+  paddw       m1, filter_rnd
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  paddw       m1, m3
+  paddw       m0, m2
+  psrlw       m1, 4
+  psrlw       m0, 4
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m4, m1, m5, m6, m7
+
+  lea         srcq, [srcq+src_strideq*2]
+  lea         dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+src_strideq*2]
+  movu        m2, [srcq+2]
+  movu        m3, [srcq+src_strideq*2+2]
+  mova        m4, [dstq]
+  mova        m5, [dstq+dst_strideq*2]
+  pmullw      m1, filter_x_a
+  pmullw      m3, filter_x_b
+  paddw       m1, filter_rnd
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  paddw       m1, m3
+  paddw       m0, m2
+  psrlw       m1, 4
+  psrlw       m0, 4
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m1, [secq]
+%endif
+  SUM_SSE     m0, m4, m1, m5, m6, m7
+
+  lea         srcq, [srcq+src_strideq*4]
+  lea         dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp         y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%if ARCH_X86_64
+  lea         bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+  shl         x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova        m8, [bilin_filter+x_offsetq]
+  mova        m9, [bilin_filter+x_offsetq+16]
+  mova        m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add         x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov         tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add         x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m1, [srcq+16]
+  movu        m2, [srcq+2]
+  movu        m3, [srcq+18]
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  pmullw      m1, filter_x_a
+  pmullw      m3, filter_x_b
+  paddw       m1, filter_rnd
+  paddw       m0, m2
+  paddw       m1, m3
+  psrlw       m0, 4
+  psrlw       m1, 4
+  lea         srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq+16]
+  movu        m4, [srcq+2]
+  movu        m5, [srcq+18]
+  pmullw      m2, filter_x_a
+  pmullw      m4, filter_x_b
+  paddw       m2, filter_rnd
+  pmullw      m3, filter_x_a
+  pmullw      m5, filter_x_b
+  paddw       m3, filter_rnd
+  paddw       m2, m4
+  paddw       m3, m5
+  mova        m4, [dstq]
+  mova        m5, [dstq+16]
+  psrlw       m2, 4
+  psrlw       m3, 4
+  pavgw       m0, m2
+  pavgw       m1, m3
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m4, m1, m5, m6, m7
+  mova        m0, m2
+  mova        m1, m3
+
+  lea         srcq, [srcq+src_strideq*2]
+  lea         dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m2, [srcq+2]
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  paddw       m0, m2
+  psrlw       m0, 4
+  lea         srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu        m2, [srcq]
+  movu        m3, [srcq+src_strideq*2]
+  movu        m4, [srcq+2]
+  movu        m5, [srcq+src_strideq*2+2]
+  pmullw      m2, filter_x_a
+  pmullw      m4, filter_x_b
+  paddw       m2, filter_rnd
+  pmullw      m3, filter_x_a
+  pmullw      m5, filter_x_b
+  paddw       m3, filter_rnd
+  paddw       m2, m4
+  paddw       m3, m5
+  mova        m4, [dstq]
+  mova        m5, [dstq+dst_strideq*2]
+  psrlw       m2, 4
+  psrlw       m3, 4
+  pavgw       m0, m2
+  pavgw       m2, m3
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m2, [secq]
+%endif
+  SUM_SSE     m0, m4, m2, m5, m6, m7
+  mova        m0, m3
+
+  lea         srcq, [srcq+src_strideq*4]
+  lea         dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading filter - this is same as in 8-bit depth
+%if ARCH_X86_64
+  lea         bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+  shl         x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+  shl         y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova        m8, [bilin_filter+x_offsetq]
+  mova        m9, [bilin_filter+x_offsetq+16]
+  mova        m10, [bilin_filter+y_offsetq]
+  mova        m11, [bilin_filter+y_offsetq+16]
+  mova        m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+  mov         tempq, g_bilin_filterm
+  add         x_offsetq, tempq
+  add         y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov         tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add         x_offsetq, bilin_filter
+  add         y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu        m0, [srcq]
+  movu        m2, [srcq+2]
+  movu        m1, [srcq+16]
+  movu        m3, [srcq+18]
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  pmullw      m1, filter_x_a
+  pmullw      m3, filter_x_b
+  paddw       m1, filter_rnd
+  paddw       m0, m2
+  paddw       m1, m3
+  psrlw       m0, 4
+  psrlw       m1, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu        m2, [srcq]
+  movu        m4, [srcq+2]
+  movu        m3, [srcq+16]
+  movu        m5, [srcq+18]
+  pmullw      m2, filter_x_a
+  pmullw      m4, filter_x_b
+  paddw       m2, filter_rnd
+  pmullw      m3, filter_x_a
+  pmullw      m5, filter_x_b
+  paddw       m3, filter_rnd
+  paddw       m2, m4
+  paddw       m3, m5
+  psrlw       m2, 4
+  psrlw       m3, 4
+  mova        m4, m2
+  mova        m5, m3
+  pmullw      m0, filter_y_a
+  pmullw      m2, filter_y_b
+  paddw       m0, filter_rnd
+  pmullw      m1, filter_y_a
+  pmullw      m3, filter_y_b
+  paddw       m0, m2
+  paddw       m1, filter_rnd
+  mova        m2, [dstq]
+  paddw       m1, m3
+  psrlw       m0, 4
+  psrlw       m1, 4
+  mova        m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  pavgw       m1, [secq+16]
+%endif
+  SUM_SSE     m0, m2, m1, m3, m6, m7
+  mova        m0, m4
+  mova        m1, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea         dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%else ; %1 < 16
+  movu        m0, [srcq]
+  movu        m2, [srcq+2]
+  pmullw      m0, filter_x_a
+  pmullw      m2, filter_x_b
+  paddw       m0, filter_rnd
+  paddw       m0, m2
+  psrlw       m0, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu        m2, [srcq]
+  movu        m4, [srcq+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu        m3, [srcq]
+  movu        m5, [srcq+2]
+  pmullw      m2, filter_x_a
+  pmullw      m4, filter_x_b
+  paddw       m2, filter_rnd
+  pmullw      m3, filter_x_a
+  pmullw      m5, filter_x_b
+  paddw       m3, filter_rnd
+  paddw       m2, m4
+  paddw       m3, m5
+  psrlw       m2, 4
+  psrlw       m3, 4
+  mova        m4, m2
+  mova        m5, m3
+  pmullw      m0, filter_y_a
+  pmullw      m2, filter_y_b
+  paddw       m0, filter_rnd
+  pmullw      m4, filter_y_a
+  pmullw      m3, filter_y_b
+  paddw       m0, m2
+  paddw       m4, filter_rnd
+  mova        m2, [dstq]
+  paddw       m4, m3
+  psrlw       m0, 4
+  psrlw       m4, 4
+  mova        m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw       m0, [secq]
+  add         secq, sec_str
+  pavgw       m4, [secq]
+%endif
+  SUM_SSE     m0, m2, m4, m3, m6, m7
+  mova        m0, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea         dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+  add         secq, sec_str
+%endif
+%endif
+  dec         block_height
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+; Plain variants for W = 8 and W = 16.
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+; Averaging (second-predictor) variants for W = 8 and W = 16.
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 000000000..18eb03d12
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include <stddef.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); + +static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); +} + +static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i 
const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i 
*)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = 
_mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } 
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } + +static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { + if (rows == 4) { + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; + if (cols == 128) return subtract_128x64; + } + if (rows == 128) { + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; + } + assert(0); + return NULL; +} + +void 
aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + SubtractWxHFuncType func; + (void)bd; + + func = getSubtractFunc(rows, cols); + func(diff, diff_stride, src, src_stride, pred, pred_stride); +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c new file mode 100644 index 000000000..9b1b4c9de --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "config/aom_dsp_rtcd.h" + +typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + for (int i = 0; i < 8; i += 2) { + const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); + const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); + const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); + __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); + __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); + v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); + v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride * 2; + ref += ref_stride * 2; + } + __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); + __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); + __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +void aom_highbd_calc16x16var_avx2(const uint16_t *src, int 
src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + for (int i = 0; i < 16; ++i) { + const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); + const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride; + ref += ref_stride; + } + __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t 
var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..0d954e178 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,318 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
;

;


%include "aom_ports/x86_abi_support.asm"

SECTION .text

; aom_highbd_calc16x16var_sse2
; (
;     src_ptr         arg(0)  pointer to source samples (read as 16-bit words)
;     source_stride   arg(1)  int, in samples; doubled to bytes below
;     ref_ptr         arg(2)  pointer to reference samples (16-bit words)
;     recon_stride    arg(3)  int, in samples; doubled to bytes below
;     SSE             arg(4)  out: 32-bit sum of squared (src - ref)
;     Sum             arg(5)  out: 32-bit sum of (src - ref)
; )
; Accumulates the sum and the sum of squares of (src - ref) over a block of
; 16 rows by 16 samples, two rows per loop iteration.
; NOTE(review): nothing in the visible body sets up a return value; rax holds
; the Sum pointer when the epilog starts. Confirm callers ignore the result.
global sym(aom_highbd_calc16x16var_sse2) PRIVATE
sym(aom_highbd_calc16x16var_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ;[src_ptr]
        mov         rdi, arg(2)                 ;[ref_ptr]

        movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
        movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]
        add         rax, rax                    ; source stride in bytes
        add         rdx, rdx                    ; recon stride in bytes

        ; Prefetch data: both 16-byte halves of rows 0..3 of src, then ref
        prefetcht0  [rsi]
        prefetcht0  [rsi+16]
        prefetcht0  [rsi+rax]
        prefetcht0  [rsi+rax+16]
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax+16]

        prefetcht0  [rdi]
        prefetcht0  [rdi+16]
        prefetcht0  [rdi+rdx]
        prefetcht0  [rdi+rdx+16]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx+16]

        pxor        xmm0, xmm0                  ; clear xmm0 for unpack
        pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

        pxor        xmm6, xmm6                  ; clear xmm6 for accumulating sse
        mov         rcx, 16                     ; rows remaining

.var16loop:
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqu      xmm2, XMMWORD PTR [rdi]

        ; prefetch the two rows the next iteration will read
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax+16]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+16]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx+16]

        pxor        xmm5, xmm5                  ; per-iteration 16-bit diff sums

        ; row 0, samples 0..7 (next load is interleaved with the arithmetic)
        psubw       xmm1, xmm2
        movdqu      xmm3, XMMWORD PTR [rsi+16]
        paddw       xmm5, xmm1
        pmaddwd     xmm1, xmm1                  ; squares, pairwise summed to dwords
        movdqu      xmm2, XMMWORD PTR [rdi+16]
        paddd       xmm6, xmm1

        ; row 0, samples 8..15
        psubw       xmm3, xmm2
        movdqu      xmm1, XMMWORD PTR [rsi+rax]
        paddw       xmm5, xmm3
        pmaddwd     xmm3, xmm3
        movdqu      xmm2, XMMWORD PTR [rdi+rdx]
        paddd       xmm6, xmm3

        ; row 1, samples 0..7
        psubw       xmm1, xmm2
        movdqu      xmm3, XMMWORD PTR [rsi+rax+16]
        paddw       xmm5, xmm1
        pmaddwd     xmm1, xmm1
        movdqu      xmm2, XMMWORD PTR [rdi+rdx+16]
        paddd       xmm6, xmm1

        ; row 1, samples 8..15
        psubw       xmm3, xmm2
        paddw       xmm5, xmm3
        pmaddwd     xmm3, xmm3
        paddd       xmm6, xmm3

        ; Build a word mask that is all-ones where xmm5 is negative:
        ; (x > 0) | (x == 0) gives x >= 0, and comparing that with zero
        ; inverts it. Interleaving xmm5 with the mask sign-extends each word
        ; to a dword, which is then added to the 32-bit totals in xmm7.
        movdqa      xmm1, xmm5
        movdqa      xmm2, xmm5
        pcmpgtw     xmm1, xmm0
        pcmpeqw     xmm2, xmm0
        por         xmm1, xmm2
        pcmpeqw     xmm1, xmm0
        movdqa      xmm2, xmm5
        punpcklwd   xmm5, xmm1
        punpckhwd   xmm2, xmm1
        paddd       xmm7, xmm5
        paddd       xmm7, xmm2

        lea         rsi, [rsi + 2*rax]          ; advance two rows
        lea         rdi, [rdi + 2*rdx]
        sub         rcx, 2
        jnz         .var16loop

        ; Horizontal reduction: spread the four dwords of each accumulator
        ; over two registers (interleaved with zero), add them, then fold the
        ; upper 8 bytes onto the lower so the low dword holds the total.
        movdqa      xmm4, xmm6
        punpckldq   xmm6, xmm0

        punpckhdq   xmm4, xmm0
        movdqa      xmm5, xmm7

        paddd       xmm6, xmm4
        punpckldq   xmm7, xmm0

        punpckhdq   xmm5, xmm0
        paddd       xmm7, xmm5

        movdqa      xmm4, xmm6
        movdqa      xmm5, xmm7

        psrldq      xmm4, 8
        psrldq      xmm5, 8

        paddd       xmm6, xmm4
        paddd       xmm7, xmm5

        mov         rdi, arg(4)                 ; [SSE]
        mov         rax, arg(5)                 ; [Sum]

        movd        DWORD PTR [rdi], xmm6       ; store low dword: total sse
        movd        DWORD PTR [rax], xmm7       ; store low dword: total sum


        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


; aom_highbd_calc8x8var_sse2
; (
;     src_ptr         arg(0)  pointer to source samples (read as 16-bit words)
;     source_stride   arg(1)  int, in samples; doubled to bytes below
;     ref_ptr         arg(2)  pointer to reference samples (16-bit words)
;     recon_stride    arg(3)  int, in samples; doubled to bytes below
;     SSE             arg(4)  out: 32-bit sum of squared (src - ref)
;     Sum             arg(5)  out: 32-bit sum of (src - ref)
; )
; Accumulates the sum and the sum of squares of (src - ref) over a block of
; 8 rows by 8 samples, four rows per loop iteration.
; NOTE(review): nothing in the visible body sets up a return value; rax holds
; the Sum pointer when the epilog starts. Confirm callers ignore the result.
global sym(aom_highbd_calc8x8var_sse2) PRIVATE
sym(aom_highbd_calc8x8var_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ;[src_ptr]
        mov         rdi, arg(2)                 ;[ref_ptr]

        movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
        movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]
        add         rax, rax                    ; source stride in bytes
        add         rdx, rdx                    ; recon stride in bytes

        ; Prefetch data: rows 0..3 of src, then ref
        prefetcht0  [rsi]
        prefetcht0  [rsi+rax]
        lea         rbx, [rsi+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]

        prefetcht0  [rdi]
        prefetcht0  [rdi+rdx]
        lea         rbx, [rdi+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]

        pxor        xmm0, xmm0                  ; clear xmm0 for unpack
        pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

        pxor        xmm6, xmm6                  ; clear xmm6 for accumulating sse
        mov         rcx, 8                      ; rows remaining

.var8loop:
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqu      xmm2, XMMWORD PTR [rdi]

        ; prefetch the four rows the next iteration will read
        lea         rbx, [rsi+rax*4]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        lea         rbx, [rbx+rax*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        lea         rbx, [rdi+rdx*4]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]
        lea         rbx, [rbx+rdx*2]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]

        pxor        xmm5, xmm5                  ; per-iteration 16-bit diff sums

        ; row 0 (next load is interleaved with the arithmetic)
        psubw       xmm1, xmm2
        movdqu      xmm3, XMMWORD PTR [rsi+rax]
        paddw       xmm5, xmm1
        pmaddwd     xmm1, xmm1                  ; squares, pairwise summed to dwords
        movdqu      xmm2, XMMWORD PTR [rdi+rdx]
        paddd       xmm6, xmm1

        ; step to rows 2/3; row 1 is already loaded in xmm3/xmm2
        lea         rsi, [rsi + 2*rax]
        lea         rdi, [rdi + 2*rdx]

        ; row 1
        psubw       xmm3, xmm2
        movdqu      xmm1, XMMWORD PTR [rsi]
        paddw       xmm5, xmm3
        pmaddwd     xmm3, xmm3
        movdqu      xmm2, XMMWORD PTR [rdi]
        paddd       xmm6, xmm3

        ; row 2
        psubw       xmm1, xmm2
        movdqu      xmm3, XMMWORD PTR [rsi+rax]
        paddw       xmm5, xmm1
        pmaddwd     xmm1, xmm1
        movdqu      xmm2, XMMWORD PTR [rdi+rdx]
        paddd       xmm6, xmm1

        ; row 3
        psubw       xmm3, xmm2
        paddw       xmm5, xmm3
        pmaddwd     xmm3, xmm3
        paddd       xmm6, xmm3

        ; Build a word mask that is all-ones where xmm5 is negative:
        ; (x > 0) | (x == 0) gives x >= 0, and comparing that with zero
        ; inverts it. Interleaving xmm5 with the mask sign-extends each word
        ; to a dword, which is then added to the 32-bit totals in xmm7.
        movdqa      xmm1, xmm5
        movdqa      xmm2, xmm5
        pcmpgtw     xmm1, xmm0
        pcmpeqw     xmm2, xmm0
        por         xmm1, xmm2
        pcmpeqw     xmm1, xmm0
        movdqa      xmm2, xmm5
        punpcklwd   xmm5, xmm1
        punpckhwd   xmm2, xmm1
        paddd       xmm7, xmm5
        paddd       xmm7, xmm2

        lea         rsi, [rsi + 2*rax]          ; second half of the 4-row step
        lea         rdi, [rdi + 2*rdx]
        sub         rcx, 4
        jnz         .var8loop

        ; Horizontal reduction: spread the four dwords of each accumulator
        ; over two registers (interleaved with zero), add them, then fold the
        ; upper 8 bytes onto the lower so the low dword holds the total.
        movdqa      xmm4, xmm6
        punpckldq   xmm6, xmm0

        punpckhdq   xmm4, xmm0
        movdqa      xmm5, xmm7

        paddd       xmm6, xmm4
        punpckldq   xmm7, xmm0

        punpckhdq   xmm5, xmm0
        paddd       xmm7, xmm5

        movdqa      xmm4, xmm6
        movdqa      xmm5, xmm7

        psrldq      xmm4, 8
        psrldq      xmm5, 8

        paddd       xmm6, xmm4
        paddd       xmm7, xmm5

        mov         rdi, arg(4)                 ; [SSE]
        mov         rax, arg(5)                 ; [Sum]

        movd        DWORD PTR [rdi], xmm6       ; store low dword: total sse
        movd        DWORD PTR [rax], xmm7       ; store low dword: total sum

        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 000000000..47b052abc
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,868 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "aom_ports/mem.h" + +#include "av1/common/filter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" + +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t 
var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); +} + +#define HIGH_GET_VAR(S) \ + void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + } \ + \ + void aom_highbd_10_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + } \ + \ + void aom_highbd_12_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int 
ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN + +unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int 
aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ 
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) + +FNS(sse2); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt) \ + DECL(16, opt) \ + DECL(8, opt) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, 
&sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return 
(var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN + +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. 
+ const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. 
+ const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + return; + } + } + + const InterpFilterParams *filter = + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); + comp_pred += 8; + ref += 2 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + 
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; + pred += 8; + } +} + +static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + 
__m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { + int i; + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int 
ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + assert(!(width * height & 7)); + n = width * height >> 3; + + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred16); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + + comp_pred16 += 8; + pred += 8; + } +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_sse4.c b/media/libaom/src/aom_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..df5449a9d --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); + a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); + a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); + a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); + + b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); + b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); + b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); + b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = (int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + diff = 
(int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return diff >= 0 ? (uint32_t)diff : 0; +} + +// Sub-pixel +uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return 
aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +// Sub-pixel average + +uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + 
aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..1e67d392e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,811 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return _mm256_add_epi16(y0, u0); +} + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left 
= _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows togeter. This function does line: +// 0,1,2,3, and 16,17,18,19. The next call would do +// 4,5,6,7, and 20,21,22,23. So 4 times of calling +// would finish 32 rows. 
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle + +// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. 
+// Use a header file, intrapred_common_x86.h +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint32_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + 
row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + 
+ const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + 
__m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void 
aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = 
_mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = 
_mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) 
{ + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + 
rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void 
aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + 
_mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2.c b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..5b2452c8e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,1430 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_16(const uint8_t *ref) { + __m128i x = 
_mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = 
_mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + 
dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + 
uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = 
_mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, 
eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i 
sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); 
+} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, 
sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = 
_mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = 
_mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); 
+ sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + 
dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + +// 
----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const 
uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); + dst += stride; + } +} + +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + 
v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i 
row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = 
_mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = 
_mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = 
_mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { + int i = 0; + do { + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); + dst += stride << 2; + + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, 
left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + 
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + 
_mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm new file mode 100644 index 000000000..9aece27be --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm @@ -0,0 +1,625 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] 
+ psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET 
+ +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM 
sse2
; DC_LEFT 16x16: fill the block with the rounded average of the 16 left pixels.
; NOTE(review): argument/register counts follow the x86inc `cglobal`
; convention; the macro definitions are outside this view.
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4              ; 4 iterations x 4 rows
  psadbw                m0, m1             ; byte sums of each 8-byte half
  movhlps               m2, m0
  paddw                 m0, m2             ; total of all 16 pixels
  paddw                 m0, [GLOBAL(pw2_16)] ; rounding term (presumably 8; table not visible here)
  psraw                 m0, 4              ; divide by 16
  pshuflw               m0, m0, 0x0        ; broadcast word 0 to the low 4 words
  punpcklqdq            m0, m0             ; ... and to all 8 words
  packuswb              m0, m0             ; 16 identical bytes
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; DC_128 16x16: fill the block with the constant vector loaded from dc_128
; (presumably sixteen bytes of 128; the table is defined outside this view).
; above/left are not read.
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  mova    m0,        [GLOBAL(dc_128)]
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  RET


; DC 32x32: fill the block with the rounded average of the 32 above and the
; 32 left pixels.
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [leftq]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8              ; 8 iterations x 4 rows
  psadbw                m0, m1             ; per-half byte sums of each vector
  psadbw                m2, m1
  psadbw                m3, m1
  psadbw                m4, m1
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  movhlps               m2, m0
  paddw                 m0, m2             ; total of all 64 pixels
  paddw                 m0, [GLOBAL(pw_32)] ; rounding term (presumably 32; table not visible here)
  psraw                 m0, 6              ; divide by 64
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0             ; 16 identical bytes
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; DC_TOP 32x32: fill the block with the rounded average of the 32 above pixels.
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0
  paddw                 m0, m2             ; total of the 32 pixels
  paddw                 m0, [GLOBAL(pw2_32)] ; rounding term (presumably 16; table not visible here)
  psraw                 m0, 5              ; divide by 32
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; DC_LEFT 32x32: fill the block with the rounded average of the 32 left pixels.
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [leftq]
  mova                  m2, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0
  paddw                 m0, m2             ; total of the 32 pixels
  paddw                 m0, [GLOBAL(pw2_32)] ; rounding term (presumably 16; table not visible here)
  psraw                 m0, 5              ; divide by 32
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; DC_128 32x32: fill the block with the constant vector loaded from dc_128
; (presumably sixteen bytes of 128; the table is defined outside this view).
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  mova    m0,        [GLOBAL(dc_128)]
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  RET

; V 4x4: copy the 4 above pixels into each of the 4 rows.
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd                  m0, [aboveq]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  RET

; V 8x8: copy the 8 above pixels into each of the 8 rows.
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  RET

; V 16x16: copy the 16 above pixels into each of the 16 rows
; (4 iterations x 4 rows).
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 4
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec             nlines4d
  jnz .loop
  REP_RET

; V 32x32: copy the 32 above pixels into each of the 32 rows
; (8 iterations x 4 rows, two 16-byte stores per row).
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 8
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m1
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m1
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m1
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m1
  lea                 dstq, [dstq+strideq*4]
  dec             nlines4d
  jnz .loop
  REP_RET

; H 4x4: row i is filled with left[i].
; Only two arguments are requested in registers; `left` is fetched with
; movifnidn from leftmp (NOTE(review): x86inc semantics, macros not in view).
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
  movifnidn          leftq, leftmp
  movd                  m0, [leftq]
  punpcklbw             m0, m0
  punpcklbw             m0, m0             ; each of l1..l4 repeated 4 times
  pshufd                m1, m0, 0x1        ; dword 1 (l2 x4) into the low dword
  movd      [dstq        ], m0
  movd      [dstq+strideq], m1
  pshufd                m2, m0, 0x2        ; l3 x4
  lea                 dstq, [dstq+strideq*2]
  pshufd                m3, m0, 0x3        ; l4 x4
  movd      [dstq        ], m2
  movd      [dstq+strideq], m3
  RET

; H 8x8: row i is filled with left[i]; two iterations of 4 rows each.
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  mov                lineq, -2             ; counts up to zero
  DEFINE_ARGS dst, stride, line, left, stride3
  lea             stride3q, [strideq*3]
  movq                  m0, [leftq    ]
  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
.loop:
  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
  movq      [dstq        ], m1
  movq      [dstq+strideq], m2
  pshuflw               m1, m0, 0xaa
  pshuflw               m2, m0, 0xff
  movq    [dstq+strideq*2], m1
  movq    [dstq+stride3q ], m2
  pshufd           m0, m0, 0xe              ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
  inc                lineq
  lea                 dstq, [dstq+strideq*4] ; lea leaves flags untouched, so jnz tests the inc
  jnz .loop
  REP_RET

; H 16x16: row i is filled with left[i]; four iterations of 4 rows each,
; consuming 4 left pixels per iteration.
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  mov                lineq, -4             ; counts up to zero
  DEFINE_ARGS dst, stride, line, left, stride3
  lea             stride3q, [strideq*3]
.loop:
  movd                  m0, [leftq]
  punpcklbw             m0, m0
  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
  mova    [dstq          ], m1
  mova    [dstq+strideq  ], m2
  pshufd            m1, m0, 0xaa
  pshufd            m2, m0, 0xff
  mova    [dstq+strideq*2], m1
  mova    [dstq+stride3q ], m2
  inc                lineq
  lea                leftq, [leftq+4       ] ; lea leaves flags untouched, so jnz tests the inc
  lea                 dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET

; H 32x32: row i is filled with left[i]; eight iterations of 4 rows each,
; two 16-byte stores per row.
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
  movifnidn              leftq, leftmp
  mov                    lineq, -8         ; counts up to zero
  DEFINE_ARGS dst, stride, line, left, stride3
  lea                 stride3q, [strideq*3]
.loop:
  movd                      m0, [leftq]
  punpcklbw                 m0, m0
  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
  mova     [dstq             ], m1
  mova     [dstq+16          ], m1
  mova     [dstq+strideq     ], m2
  mova     [dstq+strideq+16  ], m2
  pshufd                m1, m0, 0xaa
  pshufd                m2, m0, 0xff
  mova     [dstq+strideq*2   ], m1
  mova     [dstq+strideq*2+16], m1
  mova     [dstq+stride3q    ], m2
  mova     [dstq+stride3q+16 ], m2
  inc                    lineq
  lea                    leftq, [leftq+4       ] ; lea leaves flags untouched, so jnz tests the inc
  lea                     dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET
diff --git a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 000000000..807ed1770
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,1692 @@
/*
 * Copyright (c) 2017, Alliance
for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = 
_mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = 
_mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, 
const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = 
_mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = 
paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i 
< 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + 
__m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16(0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + 
const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = 
_mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + const __m128i l = _mm_load_si128((const __m128i *)left); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + if (height == 4) + pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[1] = _mm_loadu_si128(((const __m128i *)left)); + + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + 
__m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weight_h[0] = _mm_unpacklo_epi8(t, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + + if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = second_half ? 
_mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, ww[0]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 4, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 8, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 16, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +// pixels[4]: above and 
below_pred interleave vector, first half +// pixels[5]: above and below_pred interleave vector, second half +// pixels[6]: left vector + 16 +// pixels[7]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); + + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[2] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[2] = _mm_load_si128((const __m128i *)left); + } else { + pixels[2] = _mm_load_si128((const __m128i *)left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[7] = pixels[3]; + } +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 
4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else if (height == 32) { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? 
_mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], 
ww, 8, dst, stride, 1); +} + +void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[8]; + load_pixel_w8(above, left, 32, pixels); + + __m128i wh[8], ww[2]; + load_weight_w8(sm_weight_arrays, 32, wh, ww); + + smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i top_right = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); + pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i 
*)(above + x)); + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); + const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); + + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + const __m128i scale_m_weights_x = + _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); + const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); + const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); + const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); + + pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); + pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); + + pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); + pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); + + pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); + pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + 
const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +// pixels[0]: above and below_pred interleave vector +static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vector +static INLINE void 
load_weight_v_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 4) { + const __m128i weight = + _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpackhi_epi8(weight, zero); + weights[3] = _mm_sub_epi16(d, weights[2]); + } +} + +static INLINE void smooth_v_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i sum = _mm_madd_epi16(pixel[0], wh_sc); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 4, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 4, weights); + + smooth_v_pred_4xh(&pixels, weights, 4, 
dst, stride); +} + +void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 8, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 8, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); +} + +void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 16, &pixels); + + __m128i weights[4]; + load_weight_v_w4(sm_weight_arrays, 16, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); + dst += stride << 3; + smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height, + __m128i *weight_h) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height < 16) { + const int offset = height < 8 ? 
4 : 8; + const __m128i weight = + _mm_loadu_si128((const __m128i *)&weight_array[offset]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + s0 = _mm_add_epi32(s0, pred_round); + s0 = _mm_srai_epi32(s0, sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, pred_round); + s1 = _mm_srai_epi32(s1, sm_weight_log2_scale); + + __m128i sum01 = _mm_packus_epi16(s0, s1); + sum01 = _mm_shuffle_epi8(sum01, gat); + 
_mm_storel_epi64((__m128i *)dst, sum01); + dst += stride; + + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 4, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 4, wh); + + smooth_v_pred_8xh(pixels, wh, 4, dst, stride); +} + +void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 8, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 8, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); +} + +void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 16, pixels); + + __m128i wh[4]; + load_weight_v_w8(sm_weight_arrays, 16, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); +} + +void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 32, pixels); + + __m128i wh[8]; + load_weight_v_w8(sm_weight_arrays, 32, wh); + + smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride); +} + +static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i dup16 = 
_mm_set1_epi32(0x01000100); + const __m128i bottom_left = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = + _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i scale_m_weights_y = + _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + // 8 -> 16 + const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero); + const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y); + const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y); + // top_x * weights_y + scale_m_weights_y * bottom_left + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + pred_lo = _mm_add_epi32(pred_lo, round); + pred_hi = _mm_add_epi32(pred_hi, round); + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 16); 
+} + +void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED + +// pixels[0]: left vector +// pixels[1]: right_pred vector +static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + if (height == 4) + pixels[0] = 
_mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[0] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[0] = _mm_loadu_si128(((const __m128i *)left)); + pixels[1] = _mm_set1_epi16((uint16_t)above[3]); +} + +// weights[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + (void)height; + const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + const __m128i weights_0 = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i weights_1 = _mm_sub_epi16(d, weights_0); + weights[0] = _mm_unpacklo_epi16(weights_0, weights_1); +} + +static INLINE void smooth_h_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16(0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixel[0], rep); + b = _mm_unpacklo_epi16(b, pixel[1]); + __m128i sum = _mm_madd_epi16(b, weight[0]); + + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 4, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 4, &weights); + + smooth_h_pred_4xh(pixels, &weights, 4, dst, stride); +} + +void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + 
load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// 4x16 SMOOTH_H predictor: the 16 rows are produced as two 8-row passes over
+// the same width weights.
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  // load_weight_h_w4() ignores its height argument; pass the real block
+  // height so the call matches the block being predicted.
+  load_weight_h_w4(sm_weight_arrays, 16, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  // Move left[8..15] into the low 8 bytes, which is where
+  // smooth_h_pred_4xh() picks its per-row left pixel from.
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+//
+// Broadcasts the top-right pixel above[7] and loads the left column: 4, 8 or
+// 16 bytes for height 4, 8 or 16, and 32 bytes in the final else branch.
+// pixels[2..3] are written only in that final branch, so `pixels` must have
+// 4 entries there and 2 otherwise. The 16-byte reads of `left` use unaligned
+// loads so the caller's buffer needs no particular alignment
+// (load_pixel_h_w4() reads `left` the same way).
+// NOTE(review): the height == 4 path reads 4 bytes through a uint32_t pointer
+// cast; confirm that is acceptable for the callers' buffers.
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[0] = _mm_loadu_si128((const __m128i *)left);
+  } else {
+    pixels[0] = _mm_loadu_si128((const __m128i *)left);
+    pixels[2] = _mm_loadu_si128((const __m128i *)(left + 16));
+    pixels[3] = pixels[1];
+  }
+}
+
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+//
+// Reads the width weights starting at weight_array[8]; `height` is unused.
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_w) {
+  (void)height;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void
smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, + int h, uint8_t *dst, ptrdiff_t stride, + int second_half) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixels[0], rep); + b = _mm_unpacklo_epi16(b, pixels[1]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + sum0 = _mm_add_epi32(sum0, pred_round); + sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale); + + sum1 = _mm_add_epi32(sum1, pred_round); + sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(sum0, sum1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 4, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 4, ww); + + smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 8, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 8, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 16, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 16, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1); +} + +void 
aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_h_w8(above, left, 32, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 32, ww); + + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i tr_ly = + _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero); + const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw); + const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw); + const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw); + __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly); + __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly); + + pred_lo = _mm_add_epi32(pred_lo, pred_round); + pred_hi = _mm_add_epi32(pred_hi, pred_round); + + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = 
_mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 64); +} + +void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const 
uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 16); +} diff --git a/media/libaom/src/aom_dsp/x86/inv_wht_sse2.asm b/media/libaom/src/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000..0bc841a7a --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,107 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Renames m1..m3 so that values held in the order (a, c, d, b) are addressed
; as (a, b, c, d) by the code that follows.
; NOTE(review): SWAP comes from x86inc; presumably it only permutes the
; register-name mapping and emits no instructions -- confirm against x86inc.
%macro REORDER_INPUTS 0
  ; a c d b  to  a b c d
  SWAP 1, 3, 2
%endmacro

; One 1-D pass of the transform over four vectors of 16-bit lanes.
; On exit m0..m3 again name a, b, c, d; m4 and m5 are used as scratch.
%macro TRANSFORM_COLS 0
  ; input:
  ; m0 a
  ; m1 b
  ; m2 c
  ; m3 d
  paddw           m0,        m2             ; a += c
  psubw           m3,        m1             ; d -= b

  ; wide subtract
  ; e = (a - d) >> 1, evaluated in 32 bits: punpcklwd places each low word in
  ; the upper half of a dword (the lower half is stale data from m4/m5), and
  ; the arithmetic shift by 16 then sign-extends it and drops the stale half.
  punpcklwd       m4,        m0
  punpcklwd       m5,        m3
  psrad           m4,        16
  psrad           m5,        16
  psubd           m4,        m5
  psrad           m4,        1
  packssdw        m4,        m4             ; e

  psubw           m5,        m4,        m1  ; b
  psubw           m4,        m2             ; c
  psubw           m0,        m5             ; a -= new b
  paddw           m3,        m4             ; d += new c
                                ; m0 a
  SWAP            1,         5  ; m1 b
  SWAP            2,         4  ; m2 c
                                ; m3 d
%endmacro

; Transposes the 4x4 word matrix held in the low 64 bits of m0..m3 (one row
; per register, rows currently ordered so that m0/m2 and m1/m3 pair up).
; Results are left in the low halves of m0..m3.
%macro TRANSPOSE_4X4 0
  punpcklwd       m0,        m2
  punpcklwd       m1,        m3
  mova            m2,        m0
  punpcklwd       m0,        m1
  punpckhwd       m2,        m1
  pshufd          m1,        m0,        0x0e  ; move the high qword of m0 down
  pshufd          m3,        m2,        0x0e  ; move the high qword of m2 down
%endmacro

; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
%macro TRANSPOSE_4X4_WIDE 0
  mova            m3,        m0
  punpcklwd       m0,        m1
  punpckhwd       m3,        m1
  mova            m2,        m0
  punpcklwd       m0,        m3
  punpckhwd       m2,        m3
  pshufd          m1,        m0,        0x0e  ; move the high qword of m0 down
  pshufd          m3,        m2,        0x0e  ; move the high qword of m2 down
%endmacro

; Adds two rows of four 16-bit residuals (registers %1 and %2) to the four
; bytes at [outputq] and [outputq + strideq] and stores them back.
; %3/%4 are scratch registers, %5 must hold zero: it widens the loaded bytes
; to words and is the second operand of the unsigned-saturating pack.
%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
  movd            m%3,       [outputq]
  movd            m%4,       [outputq + strideq]
  punpcklbw       m%3,       m%5
  punpcklbw       m%4,       m%5
  paddw           m%1,       m%3
  paddw           m%2,       m%4
  packuswb        m%1,       m%5
  packuswb        m%2,       m%5
  movd            [outputq], m%1
  movd            [outputq + strideq], m%2
%endmacro

; iwht4x4_16_add(input, output, stride)
; Reads 64 bytes of 32-bit values from input (four 16-byte loads), narrows
; them to 16 bits with signed saturation, runs two transform passes and adds
; the result to a 4x4 block of bytes at output (rows `stride` bytes apart).
; NOTE(review): the "3, 3, 7" operands are presumably x86inc's argument,
; GPR and XMM register counts -- confirm against the x86inc documentation.
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
  mova            m0,        [inputq +  0]
  packssdw        m0,        [inputq + 16]  ; m0 = first 8 values as words
  mova            m1,        [inputq + 32]
  packssdw        m1,        [inputq + 48]  ; m1 = last 8 values as words
  psraw           m0,        2              ; arithmetic >> 2 on every value
  psraw           m1,        2

  ; first pass
  TRANSPOSE_4X4_WIDE
  REORDER_INPUTS
  TRANSFORM_COLS
  ; second pass
  TRANSPOSE_4X4
  REORDER_INPUTS
  TRANSFORM_COLS

  pxor            m4,        m4             ; zero for ADD_STORE_4P_2X
  ADD_STORE_4P_2X  0, 1, 5, 6, 4            ; output rows 0 and 1
  lea             outputq,   [outputq + 2 * strideq]
  ADD_STORE_4P_2X  2, 3, 5, 6, 4            ; output rows 2 and 3

  RET
diff --git a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c new file mode 100644 index 000000000..c3c88245a --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95]. 
+ const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return 
res; +} + +unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +#define jnt_sadMxN_sse2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +#define jnt_sadMxN_avx2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + 
return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ + } + +/* clang-format off */ +jnt_sadMxN_sse2(128, 128) +jnt_sadMxN_sse2(128, 64) +jnt_sadMxN_sse2(64, 128) +jnt_sadMxN_sse2(64, 64) +jnt_sadMxN_sse2(64, 32) +jnt_sadMxN_sse2(32, 64) +jnt_sadMxN_sse2(32, 32) +jnt_sadMxN_sse2(32, 16) +jnt_sadMxN_sse2(16, 32) +jnt_sadMxN_sse2(16, 16) +jnt_sadMxN_sse2(16, 8) +jnt_sadMxN_sse2(8, 16) +jnt_sadMxN_sse2(8, 8) +jnt_sadMxN_sse2(8, 4) +jnt_sadMxN_sse2(4, 8) +jnt_sadMxN_sse2(4, 4) +jnt_sadMxN_sse2(4, 16) +jnt_sadMxN_sse2(16, 4) +jnt_sadMxN_sse2(8, 32) +jnt_sadMxN_sse2(32, 8) +jnt_sadMxN_sse2(16, 64) +jnt_sadMxN_sse2(64, 16) + /* clang-format on */ diff --git a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 000000000..f9a41a210 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { + int i; + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + if (width >= 16) { + // Read 16 pixels one row at a time + assert(!(width & 15)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; 
j += 16) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + // Read 8 pixels two row at a time + assert(!(width & 7)); + assert(!(width & 1)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 2 * ref_stride; + } + } else { + // Read 4 pixels four row at a time + assert(!(width & 3)); + assert(!(height & 3)); + for (i = 0; i < height; i += 4) { + const uint8_t *row0 = ref + 0 * ref_stride; + const uint8_t *row1 = ref + 1 * ref_stride; + const uint8_t *row2 = ref + 2 * ref_stride; + const uint8_t *row3 = ref + 3 * ref_stride; + + __m128i p0 = + _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], + row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], + row3[0], row3[1], row3[2], row3[3]); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 4 * ref_stride; + } + } +} + +void aom_jnt_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const uint8_t w0 = 
(uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} + +#define JNT_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +JNT_SUBPIX_AVG_VAR(128, 128) +JNT_SUBPIX_AVG_VAR(128, 64) +JNT_SUBPIX_AVG_VAR(64, 128) +JNT_SUBPIX_AVG_VAR(64, 64) +JNT_SUBPIX_AVG_VAR(64, 32) +JNT_SUBPIX_AVG_VAR(32, 64) +JNT_SUBPIX_AVG_VAR(32, 32) +JNT_SUBPIX_AVG_VAR(32, 16) +JNT_SUBPIX_AVG_VAR(16, 32) +JNT_SUBPIX_AVG_VAR(16, 16) +JNT_SUBPIX_AVG_VAR(16, 8) +JNT_SUBPIX_AVG_VAR(8, 16) +JNT_SUBPIX_AVG_VAR(8, 8) +JNT_SUBPIX_AVG_VAR(8, 4) +JNT_SUBPIX_AVG_VAR(4, 8) +JNT_SUBPIX_AVG_VAR(4, 4) +JNT_SUBPIX_AVG_VAR(4, 16) +JNT_SUBPIX_AVG_VAR(16, 4) +JNT_SUBPIX_AVG_VAR(8, 32) +JNT_SUBPIX_AVG_VAR(32, 8) +JNT_SUBPIX_AVG_VAR(16, 64) +JNT_SUBPIX_AVG_VAR(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c 
b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 000000000..9d88b5e49 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,2385 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx 
xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx 
xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = 
_mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = 
_mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontaly Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = 
_mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + 
w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontaly. Used +// for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + 
w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = 
_mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { + const __m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = 
signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = 
_mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = 
_mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh 
= + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i x0, x1, x2, x3; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); + + // Transpose 8x4 to 4x8 + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); +} + +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further 
"merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i 
sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = _mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = 
_mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + 
qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + 
res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = 
_mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done + + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, 
q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = 
_mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + 
_mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, 
_mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + _mm_cvtsi32_si128(*(int *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + _mm_cvtsi32_si128(*(int *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + _mm_cvtsi32_si128(*(int *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + _mm_cvtsi32_si128(*(int *)(s + 1 * p))); + + q0p0 = 
_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + _mm_cvtsi32_si128(*(int *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + _mm_cvtsi32_si128(*(int *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + _mm_cvtsi32_si128(*(int *)(s + 6 * p))); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); +} + +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + 
_mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + 
*p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + 
flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = 
_mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = 
_mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // 
filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 
3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + { + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // filter8 need it only if 
flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); + + work_pq = _mm_andnot_si128(flat, q2p2); + 
pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); +} + +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const 
__m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} + +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t 
*_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + 
_mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 
= _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); + + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const 
uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + 
+ __m128i p0, q0; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, + &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p1, q1; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s 
- 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); +} + +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, 
&q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); +} + +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); + + transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, + &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); + + q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); + q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); + q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); + q3p3 = _mm_unpacklo_epi64(d8d9, 
_mm_srli_si128(d6d7, 8)); + q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); + q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); + q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); + q7 = _mm_srli_si128(d14d15, 8); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, + &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); +} diff --git a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..8970fe7dd --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5) { + __m128i w0, w1, w2, w3, w4, w5, ww0; + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 + *d1 = _mm_unpackhi_epi64(ww0, + _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + *d2 = _mm_unpacklo_epi64(ww0, + _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx + + w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx + w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx + w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx + + *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 + + ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 + *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 + *d5 = _mm_unpackhi_epi64(ww0, + _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx +} + +static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i zero = _mm_setzero_si128(); + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 
23 33 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + + *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx + *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx + *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx + *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx +} + +static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, ww2, ww3; + __m128i zero = _mm_setzero_si128(); + + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + + ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + + *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx + *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx + *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx + *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx +} + +// here in and out pointers (x and d) should be different! 
we don't store their +// values inside +static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); + highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); +} + +static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 
72 + *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 + w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +// here in and out pointers (x and d) should be different! we don't store their +// values inside +static INLINE void highbd_transpose8x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); + highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); +} + +// here in and out pointers (x and d arrays) should be different! 
we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + +#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..584b5e7e3 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + +static INLINE unsigned int masked_sad32xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 32) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. 
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { + __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); + __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); + __m256i a = _mm256_castsi128_si256(a0); + return _mm256_inserti128_si256(a, a1, 1); +} + +static INLINE unsigned int masked_sad16xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i m_inv = 
_mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + 
switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + 
_mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. 
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int 
ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 000000000..493f9bd8f --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + +// For width a multiple of 16 +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + m, msk, msk_stride, m, n); \ + else \ + return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + unsigned int aom_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ + else \ + return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + unsigned int aom_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ + else \ + return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + 
ref_stride, msk, msk_stride, n); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. 
+ const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return (sad + 31) >> 6; +} + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); + const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); + const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi8(a0, b0); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = 
xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpacklo_epi8(a1, b1); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return (sad + 31) >> 6; +} + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + // Load two rows at a time, this seems to be a bit faster + // than four rows at a time in this case. 
+ const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data = _mm_unpacklo_epi8(a, b); + const __m128i mask = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_16bit = _mm_maddubs_epi16(data, mask); + pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + // At this point, the SAD is stored in lane 0 of 'res' + int32_t sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; +} + +// For width a multiple of 8 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define 
HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ + else \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ + } + +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) +HIGHBD_MASKSAD4XN_SSSE3(16) +HIGHBD_MASKSADMXN_SSSE3(16, 4) +HIGHBD_MASKSADMXN_SSSE3(8, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 8) +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const 
__m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
+ res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; +} + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), + _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), + _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i 
pred = _mm_packs_epi32(pred_l, pred_r); + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; +} diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..cffbd9672 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ + +/* Masked SAD for 8-pixel-wide blocks. Each predicted pixel combines a_ptr and b_ptr with the per-pixel weights m and (1 << AOM_BLEND_A64_ROUND_BITS) - m read from m_ptr, is rounded with xx_roundn_epu16(..., AOM_BLEND_A64_ROUND_BITS), and its absolute difference from src_ptr is accumulated. The implementation consumes two rows per loop iteration (it also reads the row at +stride), so 'height' is expected to be even. Returns (sad + 31) >> 6. */ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +/* Masked SAD for 4-pixel-wide blocks. Same weighting of a_ptr and b_ptr by m and (1 << AOM_BLEND_A64_ROUND_BITS) - m as the 8-wide variant; four bytes are read per row and two rows are consumed per loop iteration, so 'height' is expected to be even. Returns (sad + 31) >> 6. */ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +/* High-bitdepth masked SAD for 4-pixel-wide blocks. src8, a8 and b8 are passed through CONVERT_TO_SHORTPTR and then read as uint16_t samples (strides are applied to the converted pointers); m_ptr remains an 8-bit mask that is zero-extended to 16 bits. Two rows are consumed per loop iteration, so 'height' is expected to be even. Returns (sad + 31) >> 6. */ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c new file mode 100644 index 000000000..d7dbefd7d --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -0,0 +1,1064 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <stdlib.h> +#include <string.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" + +// For width a multiple of 16 +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h); + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +// For width a multiple of 16 +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_); + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +#define MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * W]; \ + \ + bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, sse, &sum); \ + else \ 
+ masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 8]; \ + \ + bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ + } + +#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 4]; \ + \ + bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } + +MASK_SUBPIX_VAR_SSSE3(128, 128) +MASK_SUBPIX_VAR_SSSE3(128, 64) +MASK_SUBPIX_VAR_SSSE3(64, 128) +MASK_SUBPIX_VAR_SSSE3(64, 64) +MASK_SUBPIX_VAR_SSSE3(64, 32) +MASK_SUBPIX_VAR_SSSE3(32, 64) +MASK_SUBPIX_VAR_SSSE3(32, 32) +MASK_SUBPIX_VAR_SSSE3(32, 16) +MASK_SUBPIX_VAR_SSSE3(16, 32) +MASK_SUBPIX_VAR_SSSE3(16, 16) +MASK_SUBPIX_VAR_SSSE3(16, 8) +MASK_SUBPIX_VAR8XH_SSSE3(16) 
+MASK_SUBPIX_VAR8XH_SSSE3(8) +MASK_SUBPIX_VAR8XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(8) +MASK_SUBPIX_VAR4XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(16) +MASK_SUBPIX_VAR_SSSE3(16, 4) +MASK_SUBPIX_VAR8XH_SSSE3(32) +MASK_SUBPIX_VAR_SSSE3(32, 8) +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) + +static INLINE __m128i filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a, b); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi8(a, b); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + __m128i z = _mm_alignr_epi8(y, x, 1); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + const __m128i z = _mm_alignr_epi8(y, x, 1); + const __m128i res = filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; + } + } + + // Vertical filter + if 
(yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0, + const __m128i a1, const __m128i b1, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a0, b0); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi8(a1, b1); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 8; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 8; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for 
(i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 16; + } + // Handle i = h separately + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + + __m128i v0 = _mm_unpacklo_epi8(x0, z0); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 8; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); + const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = xx_loadl_32((__m128i *)src); + xx_storel_32((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + __m128i z = 
_mm_srli_si128(x, 1); + xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 4; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i x0 = _mm_loadl_epi64((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); + const __m128i z2 = _mm_srli_si128(x2, 1); + const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); + const __m128i z3 = _mm_srli_si128(x3, 1); + + const __m128i a0 = _mm_unpacklo_epi32(x0, x1); + const __m128i b0 = _mm_unpacklo_epi32(z0, z1); + const __m128i a1 = _mm_unpacklo_epi32(x2, x3); + const __m128i b1 = _mm_unpacklo_epi32(z2, z3); + const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 4; + b += 16; + } + // Handle i = h separately + const __m128i x = _mm_loadl_epi64((__m128i *)src); + const __m128i z = _mm_srli_si128(x, 1); + + __m128i v0 = _mm_unpacklo_epi8(x, z); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = xx_loadl_32((__m128i *)dst); + __m128i y = xx_loadl_32((__m128i *)&dst[4]); + xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i a = xx_loadl_32((__m128i *)dst); + const __m128i b = xx_loadl_32((__m128i *)&dst[4]); 
+ const __m128i c = xx_loadl_32((__m128i *)&dst[8]); + const __m128i d = xx_loadl_32((__m128i *)&dst[12]); + const __m128i e = xx_loadl_32((__m128i *)&dst[16]); + + const __m128i a0 = _mm_unpacklo_epi32(a, b); + const __m128i b0 = _mm_unpacklo_epi32(b, c); + const __m128i a1 = _mm_unpacklo_epi32(c, d); + const __m128i b1 = _mm_unpacklo_epi32(d, e); + const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static INLINE void accumulate_block(const __m128i src, const __m128i a, + const __m128i b, const __m128i m, + __m128i *sum, __m128i *sum_sq) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. 
+ const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi8(src, zero); + const __m128i src_r = _mm_unpackhi_epi8(src, zero); + const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); + const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); + + // Update partial sums and partial sums of squares + *sum = + _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); + *sum_sq = + _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), + _mm_madd_epi16(diff_r, diff_r))); +} + +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_) { + int x, y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + accumulate_block(src, a, b, m, &sum, &sum_sq); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance8xh(const uint8_t *src_ptr, int 
src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + accumulate_block(src, a, b, m, &sum, &sum_sq); + + src_ptr += src_stride * 2; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 4) { + // Load four rows at a time + __m128i src = + _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride], + *(uint32_t *)&src_ptr[src_stride * 2], + *(uint32_t *)&src_ptr[src_stride * 3]); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_setr_epi32( + *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], + *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); + accumulate_block(src, a, b, m, &sum, &sum_sq); + + src_ptr += src_stride * 4; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 4; + } + // Reduce down to a single sum and sum of 
squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +// For width a multiple of 8 +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h); + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h); + +// For width a multiple of 8 +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_); + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_); + +#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)sse64; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int 
aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)sse_; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / 
(4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) + +static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a, b); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi16(a, b); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + __m128i z = _mm_alignr_epi8(y, x, 2); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); + } + src += src_stride; + b += w; + } + 
} else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + const __m128i z = _mm_alignr_epi8(y, x, 2); + const __m128i res = highbd_filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = highbd_filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i highbd_filter_block_2rows(const __m128i a0, + const __m128i b0, + const __m128i a1, + const __m128i b1, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a0, b0); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi16(a1, b1); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h) { + int i; + // Horizontal filter + if (xoffset == 0) { 
+ uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); + src += src_stride; + b += 4; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 2); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 2); + const __m128i res = + highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 8; + } + // Process i = h separately + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + + __m128i v0 = _mm_unpacklo_epi16(x, z); + v0 = _mm_madd_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i res = 
highbd_filter_block_2rows(x, y, y, z, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 8; + } + } +} + +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_) { + int x, y; + // Note on bit widths: + // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, + // so this can be kept as four 32-bit values. + // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, + // so this must be stored as two 64-bit values. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + // Calculate 8 predicted pixels. 
+ const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit + // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. + // So we can re-pack into 16-bit fields and use _mm_madd_epi16 + // to calculate the squares and partially sum them. 
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + // Then we want to sign-extend to 64 bits and accumulate + const __m128i sign = _mm_srai_epi32(prod, 31); + const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); + const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); + sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, zero); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); + _mm_storel_epi64((__m128i *)sse, sum_sq); +} + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_) { + int y; + // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). + // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 + // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. + // So we can safely pack sum_sq into 32-bit fields, which is slightly more + // convenient. 
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + sum_sq = _mm_add_epi32(sum_sq, prod); + + src_ptr += src_stride * 2; + a_ptr += 8; + b_ptr += 8; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = 
_mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + assert(height % 2 == 0); + int i = 0; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, + mask + mask_stride, comp_pred + width); + comp_pred += (width << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } else { // width == 32 + assert(width == 32); + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16); + comp_pred += (width); + src0 += (stride0); + src1 += (stride1); + mask += (mask_stride); + i += 1; + } while (i < height); + } +} diff --git a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h new file mode 100644 index 000000000..4faa098ac --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ + +#include <stdlib.h> +#include <string.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" + +static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, + const uint8_t *src1, + const uint8_t *mask, uint8_t *dst) { + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); + const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); + const __m128i aA = _mm_load_si128((const __m128i *)(mask)); + + const __m128i maA = _mm_sub_epi8(alpha_max, aA); + + const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); + const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); + const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); + const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); + + const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); + const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); + + const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); + const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); + _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); +} + +static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + const uint8_t *mask, + int mask_stride) { + int i = 0; + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + // odd line A + const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); + const __m128i sA1 = 
_mm_loadl_epi64((const __m128i *)(src1)); + const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask)); + // even line B + const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); + const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); + const __m128i a = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); + + const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); + const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); + + const __m128i ma = _mm_sub_epi8(alpha_max, a); + const __m128i aaA = _mm_unpacklo_epi8(a, ma); + const __m128i aaB = _mm_unpackhi_epi8(a, ma); + + const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); + const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); + const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); + const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); + const __m128i round = _mm_packus_epi16(roundA, roundB); + // comp_pred's stride == width == 8 + _mm_store_si128((__m128i *)(comp_pred), round); + comp_pred += (8 << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); +} + +#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/media/libaom/src/aom_dsp/x86/mem_sse2.h b/media/libaom/src/aom_dsp/x86/mem_sse2.h new file mode 100644 index 000000000..6c821673e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/mem_sse2.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_AOM_DSP_X86_MEM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, + const int byte_stride) { + return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride), + *(const int32_t *)((int8_t *)src + 1 * byte_stride), + *(const int32_t *)((int8_t *)src + 2 * byte_stride), + *(const int32_t *)((int8_t *)src + 3 * byte_stride)); +} + +static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, + const int byte_stride) { + __m128i dst; + dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); + dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); + return dst; +} + +#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h b/media/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h new file mode 100644 index 000000000..5181e444c --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ + +#include <smmintrin.h> + +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n)); + const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); + const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/media/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h b/media/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h new file mode 100644 index 000000000..48486c6c4 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" + +static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} + +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/media/libaom/src/aom_dsp/x86/obmc_sad_avx2.c b/media/libaom/src/aom_dsp/x86/obmc_sad_avx2.c new file mode 100644 index 000000000..2aa2a0555 --- /dev/null +++ 
b/media/libaom/src/aom_dsp/x86/obmc_sad_avx2.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + do { + const __m128i v_p_b_0 = xx_loadl_32(pre); + const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); + const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const 
__m128i v_p_w_0 = xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, 
and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/obmc_sad_sse4.c b/media/libaom/src/aom_dsp/x86/obmc_sad_sse4.c new file mode 100644 index 000000000..0338a8c77 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/obmc_sad_sse4.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016, Alliance 
for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_b = xx_loadl_32(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + 
const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/obmc_variance_avx2.c b/media/libaom/src/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m128i v_d; + const uint8_t *pre_temp; + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); + const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const 
__m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else if (W == 8) { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } else { \ + obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, 
&sum, W, H); \ + } \ + \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c new file mode 100644 index 000000000..72eda0e57 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) + +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * 
W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum += xx_hsum_epi32_si64(v_sum_d); + *sse += xx_hsum_epi32_si64(v_sse_d); +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w < 128 || h < 128) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 
4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + int max_pel_allowed_per_ovf = 512; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w * h <= max_pel_allowed_per_ovf) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HBD_OBMCVARWXH(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_OBMCVARWXH(128, 128) +HBD_OBMCVARWXH(128, 64) +HBD_OBMCVARWXH(64, 128) +HBD_OBMCVARWXH(64, 64) +HBD_OBMCVARWXH(64, 32) +HBD_OBMCVARWXH(32, 64) +HBD_OBMCVARWXH(32, 32) +HBD_OBMCVARWXH(32, 16) +HBD_OBMCVARWXH(16, 32) +HBD_OBMCVARWXH(16, 16) +HBD_OBMCVARWXH(16, 8) +HBD_OBMCVARWXH(8, 16) +HBD_OBMCVARWXH(8, 8) +HBD_OBMCVARWXH(8, 4) +HBD_OBMCVARWXH(4, 8) +HBD_OBMCVARWXH(4, 4) +HBD_OBMCVARWXH(4, 16) +HBD_OBMCVARWXH(16, 4) +HBD_OBMCVARWXH(8, 32) +HBD_OBMCVARWXH(32, 8) +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm new file mode 100644 index 000000000..216a0bd8f --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm @@ -0,0 +1,435 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; QUANTIZE_FN <suffix>, <gpr count>
;
; Expands to the function quantize_<suffix> taking the named arguments
;   coeff, ncoeff, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   eob, scan, iscan
; (scan is declared but never read below).
;
; For every group of 16 coefficients the code
;   * packs the 32-bit inputs to signed 16-bit with saturation (packssdw),
;   * builds a mask of the lanes whose magnitude is greater than zbin - 1,
;   * computes, with a = |c| +sat round,
;       q = (((a * quant) >> 16) + a) * shift >> 16
;     restores the sign and clears the lanes that failed the zbin test,
;   * sign-extends q to 32 bits and stores it to qcoeff,
;   * stores the low 16 bits of q * dequant (b_32x32: (|q| * dequant) >> 1
;     with the sign restored), sign-extended to 32 bits, to dqcoeff,
;   * keeps a running max of iscan[i] + 1 over the lanes whose dequantized
;     value is non-zero; the final maximum is stored to [eob] as 16 bits.
; zbin/round/quant/shift/dequant are used as loaded for the first eight
; coefficients; afterwards their upper 64 bits are broadcast (punpckhqdq)
; and used for all remaining coefficients.
; NOTE(review): presumably lane 0 of those tables is the DC entry and the
; upper lanes the AC entry - confirm against the caller.
; For b_32x32, zbin and round are first replaced by (x + 1) >> 1 and shift is
; doubled.
; All table/array accesses use mova, so the pointers are expected to be
; suitably aligned.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  vzeroupper

%ifnidn %1, b_32x32

  ; Special case for ncoeff == 16, as it is frequent and we can save on
  ; not setting up a loop.
  cmp         ncoeffmp, 16
  jne         .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Special case of ncoeff == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.single:

  movifnidn   coeffq, coeffmp
  movifnidn   zbinq, zbinmp
  mova        m0, [zbinq]              ; m0 = zbin

  ; Get DC and first 15 AC coeffs - in this special case, that is all.
  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
  mova        m9, [coeffq]
  packssdw    m9, [coeffq+16]          ; m9 = c[i]
  mova        m10, [coeffq+32]
  packssdw    m10, [coeffq+48]         ; m10 = c[i]

  mov         r0, eobmp                ; Output pointer
  mov         r1, qcoeffmp             ; Output pointer
  mov         r2, dqcoeffmp            ; Output pointer

  pxor        m5, m5                   ; m5 = dedicated zero

  pcmpeqw     m4, m4                   ; All word lanes -1
  paddw       m0, m4                   ; m0 = zbin - 1

  pabsw       m6, m9                   ; m6 = abs(m9)
  pabsw       m11, m10                 ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0               ; m7 = abs(c[i]) > zbin - 1, i.e. >= zbin
  punpckhqdq  m0, m0                   ; broadcast upper half of zbin - 1
  pcmpgtw     m12, m11, m0             ; m12 = abs(c[i]) >= zbin

  ; Check if all coeffs are less than zbin. If yes, we just write zeros
  ; to the outputs and we are done.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .single_nonzero

  ; 2 x 32 bytes per output = 16 zeroed 32-bit entries each.
  mova        [r1   ], ymm5
  mova        [r1+32], ymm5
  mova        [r2   ], ymm5
  mova        [r2+32], ymm5
  mov         [r0], word 0

  vzeroupper
  RET

.single_nonzero:

  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
  movifnidn   r3, roundmp
  movifnidn   r4, quantmp
  mov         r6, dequantmp
  mov         r5, shiftmp
  mova        m1, [r3]                 ; m1 = round
  mova        m2, [r4]                 ; m2 = quant
  mova        m3, [r6]                 ; m3 = dequant
  mova        m4, [r5]                 ; m4 = shift

  mov         r3, iscanmp

  ; r0..r3 now hold eob, qcoeff, dqcoeff and iscan (loaded above).
  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  paddsw      m6, m1                   ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                  ; m11 += round
  pmulhw      m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2             ; m13 = m11*q>>16
  paddw       m8, m6                   ; m8 += m6
  paddw       m13, m11                 ; m13 += m11
  pmulhw      m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                  ; m13 = m13*qsh>>16
  psignw      m8, m9                   ; m8 = reinsert sign
  psignw      m13, m10                 ; m13 = reinsert sign
  pand        m8, m7                   ; drop lanes below zbin
  pand        m13, m12

  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  ; (upper four lanes: interleave with a sign mask; lower four: pmovsxwd).
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [qcoeffq   ], m11
  mova        [qcoeffq+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+32], m11
  mova        [qcoeffq+48], m6

  pmullw      m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                  ; dqc[i] = qc[i] * q

  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [dqcoeffq   ], m11
  mova        [dqcoeffq+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+32], m11
  mova        [dqcoeffq+48], m6

  mova        m6, [iscanq]             ; m6 = scan[i]
  mova        m11, [iscanq+16]         ; m11 = scan[i]

  pcmpeqw     m8, m8, m5               ; m8 = dqc[i] == 0
  pcmpeqw     m13, m13, m5             ; m13 = dqc[i] == 0
  psubw       m6, m6, m7               ; m6 = scan[i] + 1 (mask lanes are -1)
  psubw       m11, m11, m12            ; m11 = scan[i] + 1
  pandn       m8, m8, m6               ; m8 = max(eob)
  pandn       m13, m13, m11            ; m13 = max(eob)
  pmaxsw      m8, m8, m13

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [eobq], ax               ; 16-bit store

  vzeroupper
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of ncoeff != 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.generic:

%endif ; %ifnidn %1, b_32x32

DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
            qcoeff, dqcoeff, dequant, eob, scan, iscan

  ; Actual quantization loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movifnidn   dequantq, dequantmp
  mova        m0, [zbinq]              ; m0 = zbin
  mova        m1, [roundq]             ; m1 = round
  mova        m2, [quantq]             ; m2 = quant
  mova        m3, [dequantq]           ; m3 = dequant
  pcmpeqw     m4, m4                   ; All lanes -1
%ifidn %1, b_32x32
  psubw       m0, m4                   ; x - (-1) = x + 1
  psubw       m1, m4
  psrlw       m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  paddw       m0, m4                   ; m0 = zbin - 1 (m4 is all -1)

  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                 ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                   ; m5 = dedicated zero

  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob


  ; Point every array at its end and index it with a negative counter that
  ; runs up to zero.
  lea         coeffq, [  coeffq+ncoeffq*4]
  lea         qcoeffq, [ qcoeffq+ncoeffq*4]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]

  lea         iscanq, [  iscanq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]

  pabsw       m6, m9                   ; m6 = abs(m9)
  pabsw       m11, m10                 ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0
  pcmpgtw     m12, m11, m0             ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .first_nonzero

  mova        [qcoeffq+ncoeffq*4   ], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova        [dqcoeffq+ncoeffq*4   ], ymm5
  mova        [dqcoeffq+ncoeffq*4+32], ymm5
  add         ncoeffq, mmsize

  ; Broadcast the upper halves of the tables for the remaining groups and
  ; start the running eob maximum at zero.
  punpckhqdq  m1, m1
  punpckhqdq  m2, m2
  punpckhqdq  m3, m3
  punpckhqdq  m4, m4
  pxor        m8, m8

  jmp         .ac_only_loop

.first_nonzero:

  paddsw      m6, m1                   ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                  ; m11 += round
  pmulhw      m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2             ; m13 = m11*q>>16
  paddw       m8, m6                   ; m8 += m6
  paddw       m13, m11                 ; m13 += m11
  pmulhw      m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                  ; m13 = m13*qsh>>16
  psignw      m8, m9                   ; m8 = reinsert sign
  psignw      m13, m10                 ; m13 = reinsert sign
  pand        m8, m7
  pand        m13, m12

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6

%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                  ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1                    ; halve the magnitude, then re-sign
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6

  pcmpeqw     m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw     m13, m5                  ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2]   ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                   ; m6 = scan[i] + 1
  psubw       m11, m12                 ; m11 = scan[i] + 1
  pandn       m8, m6                   ; m8 = max(eob)
  pandn       m13, m11                 ; m13 = max(eob)
  pmaxsw      m8, m13
  add         ncoeffq, mmsize

.ac_only_loop:

  ; pack coeff from 32bit to 16bit array
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]

  pabsw       m6, m9                   ; m6 = abs(m9)
  pabsw       m11, m10                 ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0             ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip this iteration
  ; and just write the zeros the full computation would have produced.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .rest_nonzero

  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova        [dqcoeffq+ncoeffq*4+ 0], ymm5
  mova        [dqcoeffq+ncoeffq*4+32], ymm5

  add         ncoeffq, mmsize
  jnz         .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [r2], ax
  vzeroupper
  RET

.rest_nonzero:
  paddsw      m6, m1                   ; m6 += round
  paddsw      m11, m1                  ; m11 += round
  pmulhw      m14, m6, m2              ; m14 = m6*q>>16
  pmulhw      m13, m11, m2             ; m13 = m11*q>>16
  paddw       m14, m6                  ; m14 += m6
  paddw       m13, m11                 ; m13 += m11
  pmulhw      m14, m4                  ; m14 = m14*qsh>>16
  pmulhw      m13, m4                  ; m13 = m13*qsh>>16
  psignw      m14, m9                  ; m14 = reinsert sign
  psignw      m13, m10                 ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw     m6, m5, m14
  punpckhwd   m6, m14, m6
  pmovsxwd    m11, m14
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6

%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                  ; dqc[i] = qc[i] * q
  pmullw      m13, m3                  ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m14
  punpckhwd   m6, m14, m6
  pmovsxwd    m11, m14
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6

  pcmpeqw     m14, m5                  ; m14 = dqc[i] == 0
  pcmpeqw     m13, m5                  ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                   ; m6 = scan[i] + 1
  psubw       m11, m12                 ; m11 = scan[i] + 1
  pandn       m14, m6                  ; m14 = max(eob)
  pandn       m13, m11                 ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jnz         .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [r2], ax
  vzeroupper
  RET
%endmacro

INIT_XMM avx
QUANTIZE_FN b, 9
QUANTIZE_FN b_32x32, 9
diff --git a/media/libaom/src/aom_dsp/x86/quantize_sse2.c b/media/libaom/src/aom_dsp/x86/quantize_sse2.c new file mode 100644 index 000000000..d3de6e24d --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_sse2.c @@ -0,0 +1,147 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). 
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..39d4ca674 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,272 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
; If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

; QUANTIZE_FN <suffix>, <gpr count>
;
; Expands to the function quantize_<suffix> taking the named arguments
;   coeff, ncoeff, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   eob, scan, iscan
; (scan is declared but never read below).
;
; Coefficients are handled 16 at a time: packed from 32 to 16 bits with
; saturation, compared against zbin - 1, quantized as
;   a = |c| +sat round;  q = (((a * quant) >> 16) + a) * shift >> 16
; re-signed, masked by the zbin test and stored sign-extended to qcoeff;
; q * dequant (b_32x32: (|q| * dequant) >> 1, re-signed) goes to dqcoeff.
; The largest iscan[i] + 1 over lanes with a non-zero dequantized value is
; written to [eob] as a 16-bit value.
; zbin/round/quant/shift/dequant are used as loaded for the first eight
; coefficients; afterwards their upper 64 bits are broadcast and reused.
; For b_32x32, zbin and round become (x + 1) >> 1 and shift is doubled.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movifnidn   dequantq, dequantmp
  mova        m0, [zbinq]              ; m0 = zbin
  mova        m1, [roundq]             ; m1 = round
  mova        m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                   ; m5 = 1 in every word lane
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [dequantq]           ; m3 = dequant
  mov         r2, shiftmp
  psubw       m0, [GLOBAL(pw_1)]       ; m0 = zbin - 1
  mova        m4, [r2]                 ; m4 = shift
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                   ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
  ; Point every array at its end and index it with a negative counter that
  ; runs up to zero.
  lea         coeffq, [  coeffq+ncoeffq*4]
  lea         qcoeffq, [ qcoeffq+ncoeffq*4]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea         iscanq, [  iscanq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova        m9, [  coeffq+ncoeffq*4+ 0]
  packssdw    m9, [  coeffq+ncoeffq*4+16]
  mova        m10, [  coeffq+ncoeffq*4+32]
  packssdw    m10, [  coeffq+ncoeffq*4+48]
  pabsw       m6, m9                   ; m6 = abs(m9)
  pabsw       m11, m10                 ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0
  pcmpgtw     m12, m11, m0             ; m12 = c[i] >= zbin
  paddsw      m6, m1                   ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                  ; m11 += round
  pmulhw      m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2             ; m13 = m11*q>>16
  paddw       m8, m6                   ; m8 += m6
  paddw       m13, m11                 ; m13 += m11
  pmulhw      m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                  ; m13 = m13*qsh>>16
  psignw      m8, m9                   ; m8 = reinsert sign
  psignw      m13, m10                 ; m13 = reinsert sign
  pand        m8, m7
  pand        m13, m12

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  ; (m5 is borrowed as the sign mask and zeroed again afterwards)
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                   ; reset m5 to zero register

%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                  ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                   ; reset m5 to zero register
  pcmpeqw     m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw     m13, m5                  ; m13 = dqc[i] == 0
  mova        m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                   ; m6 = scan[i] + 1
  psubw       m11, m12                 ; m11 = scan[i] + 1
  pandn       m8, m6                   ; m8 = max(eob)
  pandn       m13, m11                 ; m13 = max(eob)
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jz          .accumulate_eob

.ac_only_loop:
  ; pack coeff from 32bit to 16bit array
  mova        m9, [  coeffq+ncoeffq*4+ 0]
  packssdw    m9, [  coeffq+ncoeffq*4+16]
  mova        m10, [  coeffq+ncoeffq*4+32]
  packssdw    m10, [  coeffq+ncoeffq*4+48]

  pabsw       m6, m9                   ; m6 = abs(m9)
  pabsw       m11, m10                 ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0             ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; Nothing reaches zbin in this group: take the zero-fill shortcut.
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12
  or          r6, r2
  jz          .skip_iter
%endif
  paddsw      m6, m1                   ; m6 += round
  paddsw      m11, m1                  ; m11 += round
  pmulhw      m14, m6, m2              ; m14 = m6*q>>16
  pmulhw      m13, m11, m2             ; m13 = m11*q>>16
  paddw       m14, m6                  ; m14 += m6
  paddw       m13, m11                 ; m13 += m11
  pmulhw      m14, m4                  ; m14 = m14*qsh>>16
  pmulhw      m13, m4                  ; m13 = m13*qsh>>16
  psignw      m14, m9                  ; m14 = reinsert sign
  psignw      m13, m10                 ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  ; (the former "pxor m11, m11" here was dead: m11 is overwritten at once)
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                   ; reset m5 to zero register

%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                  ; dqc[i] = qc[i] * q
  pmullw      m13, m3                  ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5

  pcmpeqw     m14, m5                  ; m14 = dqc[i] == 0
  pcmpeqw     m13, m5                  ; m13 = dqc[i] == 0
  mova        m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                   ; m6 = scan[i] + 1
  psubw       m11, m12                 ; m11 = scan[i] + 1
  pandn       m14, m6                  ; m14 = max(eob)
  pandn       m13, m11                 ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, b_32x32
  jmp         .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*4+ 0], m5
  mova        [qcoeffq+ncoeffq*4+16], m5
  mova        [qcoeffq+ncoeffq*4+32], m5
  mova        [qcoeffq+ncoeffq*4+48], m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m5
  mova        [dqcoeffq+ncoeffq*4+16], m5
  mova        [dqcoeffq+ncoeffq*4+32], m5
  mova        [dqcoeffq+ncoeffq*4+48], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  ; eob is a 16-bit value: store only the low word so the bytes that follow
  ; *eob are left untouched (a full-register store would overwrite them).
  mov         [r2], r6w
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 9
QUANTIZE_FN b_32x32, 9
diff --git a/media/libaom/src/aom_dsp/x86/quantize_x86.h b/media/libaom/src/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..4eed7dd29 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_x86.h @@ -0,0 +1,77 @@
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <emmintrin.h> + +#include "aom/aom_integer.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. 
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c new file mode 100644 index 000000000..f662b62b1 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <immintrin.h> // AVX2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = 
_mm256_loadu_si256((const __m256i *)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 5; + rf[0] += ref_stride << 5; + rf[1] += ref_stride << 5; + rf[2] += ref_stride << 5; + rf[3] += ref_stride << 5; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + unsigned int half_width = 32; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = 
sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} diff --git a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm new file mode 100644 index 000000000..55a856985 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,257 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + movd m1, [srcq +%4] + movd m2, [ref1q+%5] + punpckldq m0, m1 + punpckldq m6, m2 + movd m1, [ref2q+%5] + movd m2, [ref3q+%5] + movd m3, [ref4q+%5] + punpckldq m4, m1 + punpckldq m7, m2 + punpckldq m5, m3 + movlhps m0, m0 + movlhps m6, m4 + movlhps m7, m5 + psadbw m6, m0 + psadbw m7, m0 +%else + movd m1, [ref1q+%3] + movd m5, [ref1q+%5] + movd m2, [ref2q+%3] + movd m4, [ref2q+%5] + punpckldq m1, m5 + punpckldq m2, m4 + movd m3, [ref3q+%3] + movd m5, [ref3q+%5] + punpckldq m3, m5 + movd m4, [ref4q+%3] + movd m5, [ref4q+%5] + punpckldq m4, m5 + movd m5, [srcq +%4] + punpckldq m0, m5 + movlhps m0, m0 + movlhps m1, m2 + movlhps m3, m4 + psadbw m1, m0 + psadbw m3, m0 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, 
[ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + 
paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_128x2x4 5-6 0 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 
16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 +SADNXN4D 4, 16 +SADNXN4D 16, 4 +SADNXN4D 8, 32 +SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 diff --git a/media/libaom/src/aom_dsp/x86/sad_avx2.c b/media/libaom/src/aom_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..a50dba64a --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +#define FSAD64_H(h) \ + unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = 
_mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD32_H(h) \ + unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD64 \ + FSAD64_H(64); \ + FSAD64_H(32); + +#define FSAD32 \ + FSAD32_H(64); \ + FSAD32_H(32); \ + FSAD32_H(16); + +/* clang-format off */ +FSAD64 +FSAD32 +/* clang-format on */ + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ + unsigned int aom_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ 
+ for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + 
_mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); + +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c new file mode 100644 index 000000000..b506d4663 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c @@ -0,0 +1,1038 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_ports/mem.h" + +// SAD +static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { + // input 8 32-bit summation + __m128i lo128, hi128; + __m256i u = _mm256_srli_si256(*v, 8); + u = _mm256_add_epi32(u, *v); + + // 4 32-bit summation + hi128 = _mm256_extracti128_si256(u, 1); + lo128 = _mm256_castsi256_si128(u); + lo128 = _mm_add_epi32(hi128, lo128); + + // 2 32-bit summation + hi128 = _mm_srli_si128(lo128, 4); + lo128 = _mm_add_epi32(lo128, hi128); + + return (unsigned int)_mm_cvtsi128_si32(lo128); +} + +unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + + // first 4 rows + __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + __m256i u0 = _mm256_sub_epi16(s0, r0); + __m256i u1 = _mm256_sub_epi16(s1, r1); + __m256i u2 = _mm256_sub_epi16(s2, r2); + __m256i u3 = _mm256_sub_epi16(s3, r3); + __m256i zero = _mm256_setzero_si256(); + __m256i sum0, sum1; + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, 
u3); + + // second 4 rows + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum1 = _mm256_add_epi16(u0, u1); + sum1 = _mm256_add_epi16(sum1, u2); + sum1 = _mm256_add_epi16(sum1, u3); + + // find out the SAD + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + r0 = _mm256_unpacklo_epi16(sum1, zero); + r1 = _mm256_unpackhi_epi16(sum1, zero); + s0 = _mm256_add_epi32(s0, s1); + r0 = _mm256_add_epi32(r0, r1); + sum0 = _mm256_add_epi32(s0, r0); + // 8 32-bit summation + + return (unsigned int)get_sad_from_mm256_epi32(&sum0); +} + +unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; + __m256i sum0; + __m256i sum = _mm256_setzero_si256(); + const __m256i zero = _mm256_setzero_si256(); + int row = 0; + + // Loop for every 4 rows + while (row < 16) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * 
src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, u3); + + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + sum = _mm256_add_epi32(sum, s0); + sum = _mm256_add_epi32(sum, s1); + // 8 32-bit summation + + row += 4; + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + } + return get_sad_from_mm256_epi32(&sum); +} + +static void sad32x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + int row_sections = 0; + + while (row_sections < 2) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16( + r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 
16))); + r2 = _mm256_avg_epu16( + r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16( + r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = _mm256_abs_epi16(s1); + s2 = _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + if (sec_ptr) sec_ptr += 32 << 1; + } +} + +unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + 
return sum; +} + +unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i 
*)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << 
left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], 
_mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, + 
const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. +static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16(r1, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r2 = _mm256_avg_epu16(r2, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16(r3, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = _mm256_abs_epi16(s1); + s2 
= _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const 
uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + 
refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + 
second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +// SAD 4D +// Combine 4 __m256i vectors to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = _mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + 
const int shift_for_4_rows = 2; + int i; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first8rows[4]; + uint32_t second8rows[4]; + const uint8_t *ref[4]; + const int shift_for_8_rows = 3; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); + src += src_stride << shift_for_8_rows; + ref[0] += ref_stride << shift_for_8_rows; + ref[1] += ref_stride << shift_for_8_rows; + ref[2] += ref_stride << shift_for_8_rows; + ref[3] += ref_stride << shift_for_8_rows; + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); + sad_array[0] = first8rows[0] + second8rows[0]; + sad_array[1] = first8rows[1] + second8rows[1]; + sad_array[2] = first8rows[2] + second8rows[2]; + sad_array[3] = first8rows[3] + second8rows[3]; +} + +void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << 
shift_for_rows; + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 4) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int 
src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = 1; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 16) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, 
ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 64) { + sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + srcp += src_stride; + refp[i] += ref_stride; + 
rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} diff --git a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c new file mode 100644 index 000000000..c6fd62c9e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + __m256i s1, s2, r1, r2; + __m256i sum = _mm256_setzero_si256(); + __m128i sum_i128; + int i; + + for (i = 0; i < 16; ++i) { + r1 = _mm256_loadu_si256((__m256i const *)ref_ptr); + r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr)); + s2 = _mm256_sad_epu8( + r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2)); + ref_ptr += ref_stride << 1; + src_ptr += src_stride << 1; + } + + sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8)); + sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1), + _mm256_castsi256_si128(sum)); + return _mm_cvtsi128_si32(sum_i128); +} + +static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 32; + uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 5; + ref_ptr += ref_stride << 5; + sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 64; + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int 
src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static void sad64x64x4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + __m128i *res) { + uint32_t sum[4]; + aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); + *res = _mm_loadu_si128((const __m128i *)sum); +} + +void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); + _mm_storeu_si128((__m128i *)res, sum0); +} + +void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + unsigned int half_width = 64; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); 
+ _mm_storeu_si128((__m128i *)res, sum0); +} + +void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = 
_mm_cvtsi128_si32(sum_sad128); + + return res; +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/media/libaom/src/aom_dsp/x86/sad_sse2.asm b/media/libaom/src/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..3251b7655 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] +%endif + psadbw m1, 
[srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] 
+ psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 8 ; sad_32x8_sse2 +SAD32XN 8, 1 ; sad_32x8_avg_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 4 ; sad_16x4_sse2 +SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh 
; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD4XN rows [, avg]
;   Emits one sum-of-absolute-differences function for a 4-pixel-wide block.
;   %1 - row count; four 4-byte rows are packed into one xmm register per
;        iteration, so n_rows is initialised to %1/4.
;   %2 - optional, defaults to 0. When 1, the packed ref rows are first
;        averaged (pavgb) with the next 16 bytes of second_pred.
;   The SAD is returned in eax.
;
; NOTE(review): SAD_FN is defined earlier in this file and is not visible here;
; it presumably binds the named registers and sets up src_stride3q /
; ref_stride3q as 3*stride -- confirm against its definition.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0           ; m0 = running SAD (two partial sums)

.loop:
  ; gather four 4-byte ref rows into m1 = row0 | row1 | row2 | row3
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  ; avg variant: ref = rounded average of ref and 16 bytes of second_pred
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  ; gather the matching four src rows into m2 the same way
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; fold the high-half partial sum onto the low half and return the low dword
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN 16 ; sad_4x16_sse2
SAD4XN 16, 1 ; sad_4x16_avg_sse2
Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <smmintrin.h> +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = yy_loadu_256(a); + const __m256i v_b0 = yy_loadu_256(b); + const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); + const __m256i v_a01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); + const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); + const __m256i v_b01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, 
_mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} + +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m256i v_a_w = 
_mm256_cvtepu8_epi16(v_a0); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const 
__m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = 
summary_all_avx2(&sum); + break; + default: break; + } + return sse; +} diff --git a/media/libaom/src/aom_dsp/x86/sse_sse4.c b/media/libaom/src/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..8b5af8469 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + 
*sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b 
+ 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + 
highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + return sse; +} diff --git a/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm b/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm new file mode 100644 index 000000000..6d9b5a12f --- /dev/null +++ 
; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
;
; In:  xmm3 = eight 16-bit source values, xmm4 = eight 16-bit reference values.
; Out: the five running accumulators are updated:
;        xmm15 += s        (per word, unsigned saturating)
;        xmm14 += r        (per word, unsigned saturating)
;        xmm13 += s*s      (pmaddwd: adjacent products summed into dwords)
;        xmm12 += r*r
;        xmm11 += s*r
; Clobbers xmm1, xmm2 and xmm3.
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3  ; sum_s
        paddusw         xmm14, xmm4  ; sum_r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1
        paddd           xmm13, xmm1 ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2
        paddd           xmm12, xmm2 ; sum_sq_r
        pmaddwd         xmm3, xmm4
        paddd           xmm11, xmm3  ; sum_sxr
%endmacro

; Sum across the register %1 starting with d words: the four dwords are
; zero-extended to qwords and added, leaving the total in the low qword of %1
; and zero in the high qword.
; Requires xmm0 == 0 (the functions in this file clear it with pxor before
; their row loop). Clobbers xmm2.
%macro SUM_ACROSS_Q 1
        movdqa          xmm2,%1
        punpckldq       %1,xmm0         ; dwords 0,1 -> two qwords
        punpckhdq       xmm2,xmm0       ; dwords 2,3 -> two qwords
        paddq           %1,xmm2
        movdqa          xmm2,%1
        punpcklqdq      %1,xmm0         ; low qword  | 0
        punpckhqdq      xmm2,xmm0       ; high qword | 0
        paddq           %1,xmm2
%endmacro

; Sum across the register %1 starting with w words: the eight words are
; zero-extended to dwords and added pairwise, then SUM_ACROSS_Q finishes the
; reduction.
; Requires xmm0 == 0. Clobbers xmm1 and, through SUM_ACROSS_Q, xmm2.
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1,xmm0         ; words 0..3 -> dwords
        punpckhwd       xmm1,xmm0       ; words 4..7 -> dwords
        paddd           %1, xmm1
        SUM_ACROSS_Q    %1
%endmacro
; Accumulates the five SSIM sums over a 16x16 block of 8-bit pixels and stores
; each one as a 32-bit value through the pointers passed as arguments 4..8
; (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, in that order).
global sym(aom_ssim_parms_16x16_sse2) PRIVATE
sym(aom_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): sp and rp are documented as int but are read here as full
    ; 64-bit values; this relies on the upper halves of the shadowed argument
    ; slots being sign-extended -- confirm for every supported ABI.
    mov             rsi,        arg(0) ;s
    mov             rcx,        arg(1) ;sp
    mov             rdi,        arg(2) ;r
    mov             rax,        arg(3) ;rp

    ; xmm0 stays zero for the whole function: it is the unpack partner that
    ; zero-extends bytes here and words/dwords in SUM_ACROSS_W / SUM_ACROSS_Q.
    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]
    ; upper 8 pixels of the row, widened to words (TABULATE_SSIM clobbers
    ; xmm3, so the row is kept in xmm5/xmm6 and copied out again below)
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ; lower 8 pixels of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; horizontal reduction: sum_s/sum_r are word lanes, the others dword lanes
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15;
    mov             rdi,arg(5)
    movd            [rdi], xmm14;
    mov             rdi,arg(6)
    movd            [rdi], xmm13;
    mov             rdi,arg(7)
    movd            [rdi], xmm12;
    mov             rdi,arg(8)
    movd            [rdi], xmm11;

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; Accumulates the five SSIM sums over an 8x8 block of 8-bit pixels and stores
; each one as a 32-bit value through the pointers passed as arguments 4..8
; (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, in that order).
global sym(aom_ssim_parms_8x8_sse2) PRIVATE
sym(aom_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): sp and rp are documented as int but are read here as full
    ; 64-bit values; this relies on the upper halves of the shadowed argument
    ; slots being sign-extended -- confirm for every supported ABI.
    mov             rsi,        arg(0) ;s
    mov             rcx,        arg(1) ;sp
    mov             rdi,        arg(2) ;r
    mov             rax,        arg(3) ;rp

    ; xmm0 stays zero for the whole function: it is the unpack partner that
    ; zero-extends bytes here and words/dwords in SUM_ACROSS_W / SUM_ACROSS_Q.
    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab source and reference pixels
    ; one 8-pixel row each (64-bit loads), widened to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; horizontal reduction: sum_s/sum_r are word lanes, the others dword lanes
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15;
    mov             rdi,arg(5)
    movd            [rdi], xmm14;
    mov             rdi,arg(6)
    movd            [rdi], xmm13;
    mov             rdi,arg(7)
    movd            [rdi], xmm12;
    mov             rdi,arg(8)
    movd            [rdi], xmm11;

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; SUM_SSE src1, dst1, src2, dst2, sum, sse
;   Takes two pairs of registers holding 16-bit pixel values, forms the
;   differences src-dst, adds them per word into `sum` (%5) and adds their
;   squares (pmaddwd: adjacent products summed into dwords) into `sse` (%6).
;   %1 and %3 are overwritten.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro

; STORE_AND_RET width
;   Function tail for SUBPEL_VARIANCE: reduces the word-lane sum in m6 and the
;   dword-lane sse in m7 to scalars, stores sse through the `sse` argument and
;   returns the sum in eax. m5 must be zero on entry (SUBPEL_VARIANCE keeps it
;   as its dedicated zero register); it is overwritten with the sign mask.
;   %1 - block width; widths above 4 use all lanes, 4xh only the low half.
%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1      ; dword 1 -> dword 0
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1      ; dword 1 -> dword 0
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  pshuflw              m4, m6, 0xe      ; words 2,3 -> words 0,1
  pshuflw              m3, m7, 0xe      ; dword 1 -> dword 0
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe      ; dword 1 -> dword 0
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro

; Advances srcq by one row. On 32-bit PIC builds the stride is added from its
; src_stridemp form rather than from the src_strideq register.
%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro
May make a minor speed +; difference on Win64 + +%if ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 + %define block_height dword heightm + %define sec_str sec_stridemp + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define block_height heightd + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define 
movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 
+%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b 
[y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb 
m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp 
y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl 
y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] 
+%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + 
movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + 
punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, 
m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 
+ packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, 
[dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/media/libaom/src/aom_dsp/x86/subtract_avx2.c b/media/libaom/src/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4389d123d --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +/* + * Writes src - pred for 32 pixels as int16: loads 32 bytes from src_ptr and + * pred_ptr with unaligned loads, zero-extends each 16-byte half to 16-bit + * lanes, subtracts, and stores the two 16-lane results with aligned 256-bit + * stores (so diff_ptr must be 32-byte aligned). + */ +static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); + __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); +} +/* + * 16-wide case: for each of `rows` rows, zero-extends 16 src and 16 pred + * bytes to 16-bit lanes, stores their difference with one aligned 256-bit + * store, then advances the three pointers by their strides. + */ +static INLINE void aom_subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); + __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(s); + __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} +/* 32-wide case: one subtract32_avx2 call per row. */ +static INLINE void aom_subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} +/* 64-wide case: two subtract32_avx2 calls per row, 32 pixels apart. */ +static INLINE void aom_subtract_block_64xn_avx2( + int rows,
int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} +/* 128-wide case: four subtract32_avx2 calls per row, 32 pixels apart. */ +static INLINE void aom_subtract_block_128xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); + subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} +/* + * AVX2 entry point: widths 16, 32, 64 and 128 use the static helpers of this + * file; every other width is forwarded unchanged to aom_subtract_block_sse2. + */ +void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 32: + aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 64: + aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 128: + aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + default: + aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} diff --git a/media/libaom/src/aom_dsp/x86/subtract_sse2.asm b/media/libaom/src/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 000000000..1a75a234f --- /dev/null +++ 
b/media/libaom/src/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void aom_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) +; Stores src minus pred as 16-bit words in diff. cols equal to 4, 8, 16, 32 or +; 64 branches to a dedicated loop; any other value falls through to .loop_128. +; The 4-, 8- and 16-wide loops handle two rows per iteration, the wider loops +; one. pred_str aliases the cols register, so pred_stride is only loaded (from +; pred_stridemp) once the cols comparisons are done. +; NOTE(review): .case_4 is assembled after INIT_MMX, so its m7 presumably names +; an MMX register that the pxor below (issued under INIT_XMM) never clears, and +; no emms is visible before its RET -- confirm against x86inc. +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + cmp colsd, 64 + je .case_64 +; loop16 src_a, src_b, pred_a, pred_b, diff_a, diff_b: widens the mmsize-byte +; groups at src_a/pred_a and src_b/pred_b to words against the zero register +; m7, subtracts pred from src, and stores each group's low half at its diff +; offset and its high half mmsize bytes later. +; NOTE(review): src and pred are read with mova, which presumably requires +; mmsize-aligned addresses -- confirm the callers guarantee that. +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro +; Fall-through case (cols not matched above): four loop16 expansions per row. + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq,
[diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + RET diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c new file mode 100644 index 000000000..0af44e3a4 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <smmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/sum_squares_sse2.h" +#include "config/aom_dsp_rtcd.h" +/* + * Sum of squares over a width x height block of int16 values, read in tiles + * of 4 rows by 16 columns. Note the loop variables are named the other way + * round: `col` steps over rows (bounded by height) and `row` steps over + * columns (bounded by width). Squares are summed in 32-bit lanes across one + * 4-row strip, then both 32-bit halves of every 64-bit lane are added into a + * 64-bit accumulator. + * NOTE(review): yy_set1_64_from_32i and xx_storel_64 are declared elsewhere; + * presumably they build the per-lane 0xffffffff mask and store the low 64 + * bits -- confirm. + */ +static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, + int width, int height) { + uint64_t result; + __m256i v_acc_q = _mm256_setzero_si256(); + const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff); + for (int col = 0; col < height; col += 4) { + __m256i v_acc_d = _mm256_setzero_si256(); + for (int row = 0; row < width; row += 16) { + const int16_t *tempsrc = src + row; + const __m256i v_val_0_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); + const __m256i v_val_1_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); + const __m256i v_val_2_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); + const __m256i v_val_3_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); +/* madd of a vector with itself: adjacent squares summed into 32-bit lanes. */ + const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); + const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); + const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); + const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); + + const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); + const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); + const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm256_add_epi32(v_acc_d,
v_sum_0123_d); + } + v_acc_q = + _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); + src += 4 * stride; + } + __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); + __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); + __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); +/* Fold the upper 64-bit lane onto the lower one; lane 0 then holds the total. */ + result_64_2_int = _mm_add_epi64( + result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); + + xx_storel_64(&result, result_64_2_int); + + return result; +} +/* + * Picks an implementation by block shape: 4x4, 4-wide and 8-wide blocks + * (height a multiple of 4) go to the SSE2 routines; widths that are a + * multiple of 16 with a height that is a multiple of 4 use the AVX2 routine + * of this file; anything else falls back to aom_sum_squares_2d_i16_c. + */ +uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, + int height) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY(width == 8 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { + return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c new file mode 100644 index 000000000..22d7739ec --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> +#include <stdio.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/sum_squares_sse2.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { + const __m128d ad = _mm_castsi128_pd(a); + return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); +} + +static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(a); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, a); + return tmp; + } +#endif +} + +static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { + const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); + const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); + const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); + const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); + const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); + const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); + + return _mm_add_epi32(v_sq_01_d, v_sq_23_d); +} + +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { + const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); + __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); + v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8)); + return (uint64_t)_mm_cvtsi128_si32(v_sum_d); +} + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { + int r = 0; + __m128i v_acc_q = _mm_setzero_si128(); + do { + const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride); + v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d); + src += stride << 2; + r += 4; + } while (r < height); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); 
+ __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), + _mm_and_si128(v_acc_q, v_zext_mask_q)); + v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); + return xx_cvtsi128_si64(v_acc_64); +} + +#ifdef __GNUC__ +// This prevents GCC/Clang from inlining this function into +// aom_sum_squares_2d_i16_sse2, which in turn saves some stack +// maintenance instructions in the common case of 4x4. +__attribute__((noinline)) +#endif +uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r = 0; + + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + do { + __m128i v_acc_d = _mm_setzero_si128(); + int c = 0; + do { + const int16_t *b = src + c; + + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + c += 8; + } while (c < width); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 4 * stride; + r += 4; + } while (r < height); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + return xx_cvtsi128_si64(v_acc_q); +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an 
XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = 
_mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + return xx_cvtsi128_si64(v_acc0_q); +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + int k = n & ~(64 - 1); + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ +#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ + +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, + int width, int height); + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height); +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); + +#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/synonyms.h b/media/libaom/src/aom_dsp/x86/synonyms.h new file mode 100644 index 000000000..1e9f1e27b --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/synonyms.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_H_ + +#include <immintrin.h> +#include <string.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. 
+static INLINE __m128i xx_loadl_32(const void *a) { + return _mm_cvtsi32_si128(*(const uint32_t *)a); +} + +static INLINE __m128i xx_loadl_64(const void *a) { + return _mm_loadl_epi64((const __m128i *)a); +} + +static INLINE __m128i xx_load_128(const void *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i xx_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i *)a); +} + +static INLINE void xx_storel_32(void *const a, const __m128i v) { + *(uint32_t *)a = _mm_cvtsi128_si32(v); +} + +static INLINE void xx_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i *)a, v); +} + +static INLINE void xx_store_128(void *const a, const __m128i v) { + _mm_store_si128((__m128i *)a, v); +} + +static INLINE void xx_storeu_128(void *const a, const __m128i v) { + _mm_storeu_si128((__m128i *)a, v); +} + +// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set_epi64x() +// acting on 32-bit integers. +static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, e1, 0, e0); +#else + return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); +#endif +} + +// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set1_epi64x() +// acting on a 32-bit integer. 
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, a, 0, a); +#else + return _mm_set1_epi64x((uint32_t)a); +#endif +} + +static INLINE __m128i xx_round_epu16(__m128i v_val_w) { + return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); + return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); + const __m128i v_tmp_d = + _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/media/libaom/src/aom_dsp/x86/synonyms_avx2.h b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h new file mode 100644 index 000000000..3f69b120e --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. +static INLINE __m256i yy_load_256(const void *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE __m256i yy_loadu_256(const void *a) { + return _mm256_loadu_si256((const __m256i *)a); +} + +static INLINE void yy_store_256(void *const a, const __m256i v) { + _mm256_store_si256((__m256i *)a, v); +} + +static INLINE void yy_storeu_256(void *const a, const __m256i v) { + _mm256_storeu_si256((__m256i *)a, v); +} + +// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm256_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m256i yy_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); +#else + return _mm256_set1_epi64x((uint32_t)a); +#endif +} + +// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We +// therefore define an equivalent function using a different intrinsic. 
+// ([ hi ], [ lo ]) -> [ hi ][ lo ] +static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((__m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((__m128i *)(lo)); + return yy_set_m128i(mhi, mlo); +} + +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} +#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/media/libaom/src/aom_dsp/x86/transpose_sse2.h b/media/libaom/src/aom_dsp/x86/transpose_sse2.h new file mode 100644 index 000000000..d0d1ee684 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/transpose_sse2.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 
36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 
36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..b1611ba87 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, + __m256i *in0, __m256i *in1, const __m256i _r, + const int32_t cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, w0); + __m256i u1 = _mm256_madd_epi16(t1, w0); + __m256i v0 = _mm256_madd_epi16(t0, w1); + __m256i v1 = _mm256_madd_epi16(t1, w1); + + __m256i a0 = _mm256_add_epi32(u0, _r); + __m256i a1 = _mm256_add_epi32(u1, _r); + __m256i b0 = _mm256_add_epi32(v0, _r); + __m256i b1 = _mm256_add_epi32(v1, _r); + + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); + + *in0 = _mm256_packs_epi32(c0, c1); + *in1 = _mm256_packs_epi32(d0, d1); +} + +static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_adds_epi16(_in0, _in1); + *in1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 
= *in1; + *in0 = _mm256_add_epi32(_in0, _in1); + *in1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_adds_epi16(_in0, _in1); + *out1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_add_epi32(_in0, _in1); + *out1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, + int stride, + __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f + // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f + // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f + // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f + // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f + // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f + // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f + // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f + // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // ... + __m256i a[16]; + for (int i = 0; i < 16; i += 2) { + a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); + a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); + } + __m256i b[16]; + for (int i = 0; i < 16; i += 2) { + b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); + b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + } + __m256i c[16]; + for (int i = 0; i < 16; i += 2) { + c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); + c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + } + out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); + out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); + out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); + out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); + + out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); + out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); + out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); + out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); + + out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); + out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); + out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); 
+ out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); + + out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); + out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); + out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); + out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000..ed82eee96 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/variance_avx2.c b/media/libaom/src/aom_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..800aef126 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" + +static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { + return _mm_add_epi16(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { + return _mm_add_epi32(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + return _mm_extract_epi32(res, 1); +} + +// handle pixels (<= 512) +static 
INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref 
+ 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); + +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) 
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); + +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); 
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); + +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE void 
comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { // for width == 32 + do { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); + + const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); + const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); + const __m256i aB = + _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (32 << 1); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } +} + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i 
s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, 
s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..88e27aef3 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

// Two-tap bilinear filter table. Each filter occupies 32 bytes: the byte pair
// (tap0, tap1) repeated 16 times, which matches the (current, next) byte
// interleaving produced by MERGE_WITH_SRC so that _mm256_maddubs_epi16 can
// apply both taps in one instruction. Every pair sums to 16, hence the
// "+ 8, >> 4" rounding in FILTER_SRC. A filter is selected with
// (offset << 5), i.e. offset * 32 bytes.
// NOTE(review): only 8 filters (256 bytes) have initialisers although the
// array is declared with 512 bytes; the remainder is zero-initialised, so any
// offset above 7 that reaches the table would select all-zero taps. Confirm
// the offset range used by callers.
/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */

// The macros below are statement macros, not expressions: they read and write
// locals of the enclosing function by name (src, dst, src_reg, dst_reg,
// src_next_reg, exp_src_lo/hi, exp_dst_lo/hi, sum_reg, sse_reg, zero_reg, pw8,
// sse, sum, ...). They are only usable inside the two functions in this file.

// Applies a two-tap filter to the interleaved byte pairs held in
// exp_src_lo/exp_src_hi and rounds the result: (a * tap0 + b * tap1 + 8) >> 4.
// The results are 16-bit lanes. Requires `pw8` (16-bit lanes of 8) in scope.
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

// Interleaves the bytes of src_reg and reg into exp_src_lo/exp_src_hi.
// With reg == zero_reg this zero-extends the pixels to 16 bits; with a second
// pixel register it builds the (a, b) byte pairs consumed by FILTER_SRC.
#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

// Unaligned 32-byte loads of the current source and destination rows.
#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

// Loads the 32 bytes at src + size_stride into src_next_reg and replaces
// src_reg with the per-byte average of the two (the half-sample case).
#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                     \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

// Loads the 32 bytes at src + size_stride into src_next_reg and interleaves
// them with src_reg, ready for FILTER_SRC.
#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

// Per-row accumulation. Expects the 16-bit predicted pixels in
// exp_src_lo/exp_src_hi and the raw destination bytes in dst_reg.
// sum_reg accumulates the differences in 16-bit lanes; sse_reg accumulates the
// squared differences in 32-bit lanes. Each 16-bit lane of sum_reg receives
// two differences per row (one from the lo half, one from the hi half), so
// with |diff| <= 255 it stays within int16 range only while height <= 64.
#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation to sum and sse
// Sign-extends the 16-bit sums to 32 bits (res_cmp is all-ones for negative
// lanes and is interleaved in as the upper half), reduces sum_reg and sse_reg
// horizontally inside each 128-bit half, then adds the two halves. The SSE is
// written through `sse`; the signed sum is left in the local `sum`.
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

// Sum of differences and sum of squared differences between a sub-pixel
// interpolated 32-pixel-wide source block and a destination block.
//
//   src, src_stride  source pixels; 32 bytes are read per row. One extra
//                    column (src + 1) is read when x_offset != 0 and one
//                    extra row (height + 1 rows in total) when y_offset != 0.
//   x_offset,        0 selects no interpolation in that direction, 8 selects
//   y_offset         the rounded average of the two neighbours, any other
//                    value selects bilinear_filters_avx2 + (offset << 5).
//   dst, dst_stride  pixels compared against; 32 bytes are read per row.
//   height           number of rows processed.
//   sse              receives the sum of squared differences.
//
// Returns the signed sum of differences converted to unsigned int.
// NOTE(review): callers presumably convert the return value back to int.
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average between previous average to current average
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average between previous pack to the current
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        // keep the horizontally filtered row for the next iteration
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge previous pack to current pack source
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        // keep the horizontally filtered row for the next iteration
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

// Same as aom_sub_pixel_variance32xh_avx2(), except that each interpolated
// source row is additionally averaged (rounded, per byte) with the matching
// 32-byte row of a second predictor before being compared with dst.
//
//   sec, sec_stride  second predictor; 32 bytes are read per row and the
//                    pointer advances by sec_stride after every row.
//
// In the branches that run FILTER_SRC, the 16-bit filter output is packed
// back to bytes before the average with `sec` and then widened again.
// All other parameters and the return value are as documented for
// aom_sub_pixel_variance32xh_avx2().
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average between previous average to current average
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average between previous pack to the current
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        // keep the horizontally filtered row for the next iteration
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and another source starting from the next
      // following byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // convert each 16 bit to 8 bit to each low and high lane source
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge previous pack to current pack source
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        // keep the horizontally filtered row for the next iteration
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
diff --git a/media/libaom/src/aom_dsp/x86/variance_impl_ssse3.c b/media/libaom/src/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. + const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const uint8_t f0 = filter[0] >> 1; + const uint8_t f1 = filter[1] >> 1; + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = xx_loadl_64(a + 1); + + // unpack to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source = _mm_unpacklo_epi8(source_low, source_hi); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } 
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} diff --git a/media/libaom/src/aom_dsp/x86/variance_sse2.c b/media/libaom/src/aom_dsp/x86/variance_sse2.c new file mode 100644 index 000000000..3c37e77c0 --- /dev/null +++ 
b/media/libaom/src/aom_dsp/x86/variance_sse2.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_ports/mem.h" + +#include "av1/common/filter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" + +unsigned int aom_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = xx_loadu_128(src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); + return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); +} + +static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); + return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); +} + +// Accumulate 4 32bit numbers in val to 1 32bit number +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, 
_mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return _mm_cvtsi128_si32(val); +} + +// Accumulate 8 16bit in sum to 4 32bit number +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src, ref); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +// diff sum of 128 pixels can still fit in 16bit integer +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} + +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + 
*sum = add32x4_sse2(vsum); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = sum_to_32bit_sse2(vsum); + *sum = add32x4_sse2(vsum); +} + +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. 
+ *sum = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. 
+ *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } +} + +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); + +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); 
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) + +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); +AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) + +AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + 
aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); 
\ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, 
dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = 
is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. 
+ const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + + return; + } + } + + const InterpFilterParams *filter = + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + int filter_taps = (subpel_search == 1) ? 
4 : SUBPEL_TAPS; + + if (!subpel_x_q3 && !subpel_y_q3) { + if (width >= 16) { + int i; + assert(!(width & 15)); + /*Read 16 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 16) { + xx_storeu_128(comp_pred, xx_loadu_128(ref)); + comp_pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + int i; + assert(!(width & 7)); + assert(!(height & 1)); + /*Read 8 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); + xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); + comp_pred += 16; + ref += 2 * ref_stride; + } + } else { + int i; + assert(!(width & 3)); + assert(!(height & 3)); + /*Read 4 pixels four rows at a time.*/ + for (i = 0; i < height; i++) { + const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); + const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); + const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); + const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); + const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), + _mm_unpacklo_epi32(row2, row3)); + xx_storeu_128(comp_pred, reg); + comp_pred += 16; + ref += 4 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, + width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, + width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, 
subpel_y_q3 << 1); + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = + (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + // TODO(Deepa): Remove the memset below when we have + // 4 tap simd for sse2 and ssse3. + if (subpel_search == 1) { + memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width); + } + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); + } +} + +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; + } +} + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int 
width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m128i zero = _mm_setzero_si128(); + + if (width == 8) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 16) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + for (int j = 0; j < 2; j++) { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16)); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = 
highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} |