diff options
Diffstat (limited to 'media/libvpx/vp8/common/x86')
31 files changed, 16052 insertions, 0 deletions
diff --git a/media/libvpx/vp8/common/x86/copy_sse2.asm b/media/libvpx/vp8/common/x86/copy_sse2.asm new file mode 100644 index 000000000..86fae2695 --- /dev/null +++ b/media/libvpx/vp8/common/x86/copy_sse2.asm @@ -0,0 +1,93 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse2) PRIVATE +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/copy_sse3.asm b/media/libvpx/vp8/common/x86/copy_sse3.asm new file mode 100644 index 000000000..d789a40cc --- /dev/null +++ b/media/libvpx/vp8/common/x86/copy_sse3.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define max_sad arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_sad [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define max_sad r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define max_sad + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + + +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse3) PRIVATE +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +.block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge .block_copy_sse3_loopx4 + + ;Check to see if there is more rows need to be copied. + cmp height, 0 + je .copy_is_done + +.block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne .block_copy_sse3_loop + +.copy_is_done: + STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/vp8/common/x86/dequantize_mmx.asm new file mode 100644 index 000000000..4e551f00a --- /dev/null +++ b/media/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -0,0 +1,258 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) +global sym(vp8_dequantize_b_impl_mmx) PRIVATE +sym(vp8_dequantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;sq + mov rdi, arg(1) ;dq + mov rax, arg(2) ;q + + movq mm1, [rsi] + pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers. + movq [rdi], mm1 + + movq mm1, [rsi+8] + pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers. + movq [rdi+8], mm1 + + movq mm1, [rsi+16] + pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers. + movq [rdi+16], mm1 + + movq mm1, [rsi+24] + pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers. + movq [rdi+24], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void dequant_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride) 3 +global sym(vp8_dequant_idct_add_mmx) PRIVATE +sym(vp8_dequant_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rdx, arg(1) ;dq + + + movq mm0, [rax ] + pmullw mm0, [rdx] + + movq mm1, [rax +8] + pmullw mm1, [rdx +8] + + movq mm2, [rax+16] + pmullw mm2, [rdx+16] + + movq mm3, [rax+24] + pmullw mm3, [rdx+24] + + mov rdx, arg(2) ;dest + + pxor mm7, mm7 + + + movq [rax], mm7 + movq [rax+8], mm7 + + movq [rax+16],mm7 + movq [rax+24],mm7 + + + movsxd rdi, dword ptr arg(3) ;stride + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rdx] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rdx+rdi] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/vp8/common/x86/filter_x86.c b/media/libvpx/vp8/common/x86/filter_x86.c new file mode 100644 index 000000000..7f496ed7d --- /dev/null +++ b/media/libvpx/vp8/common/x86/filter_x86.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/x86/filter_x86.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; diff --git a/media/libvpx/vp8/common/x86/filter_x86.h b/media/libvpx/vp8/common/x86/filter_x86.h new file mode 100644 index 000000000..d282841be --- /dev/null +++ b/media/libvpx/vp8/common/x86/filter_x86.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_X86_FILTER_X86_H_ +#define VP8_COMMON_X86_FILTER_X86_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with + * duplicated values */ + +/* duplicated 4x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); + +/* duplicated 8x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/media/libvpx/vp8/common/x86/idct_blk_mmx.c b/media/libvpx/vp8/common/x86/idct_blk_mmx.c new file mode 100644 index 000000000..f2532b34d --- /dev/null +++ b/media/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); + +void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) +{ + short *sq = (short *) d->qcoeff; + short *dq = (short *) d->dqcoeff; + + vp8_dequantize_b_impl_mmx(sq, dq, DQC); +} + +void vp8_dequant_idct_add_y_block_mmx + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride, + dst+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride, + dst+8, stride); + memset(q + 32, 0, 2 * sizeof(q[0])); + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride, + dst+12, stride); + memset(q + 48, 0, 2 * sizeof(q[0])); + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_mmx + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/media/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/vp8/common/x86/idct_blk_sse2.c new file mode 100644 index 000000000..ae96ec858 --- /dev/null +++ b/media/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +void vp8_idct_dequant_0_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); +void vp8_idct_dequant_full_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_y_block_sse2 + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride); + } + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); + else + vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride); + } + q += 64; + dst += stride*4; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_sse2 + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + dstu += stride*4; + + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)(eobs))[2] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } + q += 32; + dstv += stride*4; + + if (((short *)(eobs))[3]) + { + if (((short *)(eobs))[3] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } +} diff --git a/media/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/vp8/common/x86/idctllm_mmx.asm new file mode 100644 index 000000000..96fa2c60d --- /dev/null +++ b/media/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point version of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative +; * number. +; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + + +;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, +;int pitch, unsigned char *dest,int stride) +global sym(vp8_short_idct4x4llm_mmx) PRIVATE +sym(vp8_short_idct4x4llm_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rsi, arg(1) ;pred + + movq mm0, [rax ] + movq mm1, [rax+ 8] + movq mm2, [rax+16] + movq mm3, [rax+24] + +%if 0 + pxor mm7, mm7 + movq [rax], mm7 + movq [rax+8], mm7 + movq [rax+16],mm7 + movq [rax+24],mm7 +%endif + movsxd rax, dword ptr arg(2) ;pitch + mov rdx, arg(3) ;dest + movsxd rdi, dword ptr arg(4) ;stride + + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rsi] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rsi+rax] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + add rsi, rax + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_dc_only_idct_add_mmx( +;short input_dc, +;unsigned char *pred_ptr, +;int pred_stride, +;unsigned char *dst_ptr, +;int stride) +global sym(vp8_dc_only_idct_add_mmx) PRIVATE +sym(vp8_dc_only_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + movd mm5, arg(0) ;input_dc + mov rax, arg(1) ;pred_ptr + movsxd rdx, dword ptr arg(2) ;pred_stride + + pxor mm0, mm0 + + paddw mm5, [GLOBAL(fours)] + lea rcx, [rdx + rdx*2] + + psraw mm5, 3 + + punpcklwd mm5, mm5 + + punpckldq mm5, mm5 + + movd mm1, [rax] + movd mm2, [rax+rdx] + movd mm3, [rax+2*rdx] + movd mm4, [rax+rcx] + + mov rax, arg(3) ;d -- destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + punpcklbw mm1, mm0 + paddsw mm1, mm5 + packuswb mm1, mm0 ; pack and unpack to saturate + lea rcx, [rdx + rdx*2] + + punpcklbw mm2, mm0 + paddsw mm2, mm5 + packuswb mm2, mm0 ; pack and unpack to saturate + + punpcklbw mm3, mm0 + paddsw mm3, mm5 + packuswb mm3, mm0 ; pack and unpack to saturate + + punpcklbw mm4, mm0 + paddsw mm4, mm5 + packuswb mm4, mm0 ; pack and unpack to saturate + + movd [rax], mm1 + movd [rax+rdx], mm2 + movd [rax+2*rdx], mm3 + movd [rax+rcx], mm4 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/vp8/common/x86/idctllm_sse2.asm new file mode 100644 index 000000000..bf8e2c402 --- /dev/null +++ b/media/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) + +global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; clear coeffs + movd [rax], xmm5 + movd [rax+32], xmm5 +;pshufb + mov rax, arg(2) ; dst + movsxd rdx, dword ptr arg(3) ; dst_stride + + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + lea rcx, [rdx + rdx*2] + paddw xmm4, [GLOBAL(fours)] + + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rdx] + movq xmm2, [rax+2*rdx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) +global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + mov rdi, arg(2) ; dst + + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + lea rcx, [rdx + rdx*2] ;dst_stride * 3 + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+2*rdx] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + + mov rdi, arg(2) ; dst + mov rdx, arg(4) ; dc + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; load up 2 dc words here == 2*16 = doubleword + movd xmm4, [rdx] + + movsxd rdx, dword ptr arg(3) ; dst_stride + lea rcx, [rdx + rdx*2] + ; Load up predict blocks + movq xmm0, [rdi] + movq xmm1, [rdi+rdx*1] + movq xmm2, [rdi+rdx*2] + movq xmm3, [rdi+rcx] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [GLOBAL(fours)] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +;void vp8_idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + + mov rdi, arg(2) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; DC component + mov rdx, arg(4) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movsxd rdx, dword ptr arg(3) ; dst_stride + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + lea rcx, [rdx + rdx*2] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+rdx*2] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/media/libvpx/vp8/common/x86/iwalsh_mmx.asm b/media/libvpx/vp8/common/x86/iwalsh_mmx.asm new file mode 100644 index 000000000..158c3b745 --- /dev/null +++ b/media/libvpx/vp8/common/x86/iwalsh_mmx.asm @@ -0,0 +1,140 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE +sym(vp8_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rdx, arg(0) + mov rax, 30003h + + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movq mm7, rax + + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) + + movq mm4, mm0 + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 + psraw mm0, 3 + psraw mm5, 3 +;~~~~~~~~~~~~~~~~~~~~~ + + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + diff --git a/media/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/vp8/common/x86/iwalsh_sse2.asm new file mode 100644 index 000000000..06e86a80b --- /dev/null +++ b/media/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -0,0 +1,121 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE +sym(vp8_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h + + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] + + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm new file mode 100644 index 000000000..6d5aaa19d --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -0,0 +1,815 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro LF_ABS 2 + ; %1 value not preserved + ; %2 value preserved + ; output in %1 + movdqa scratch1, %2 ; v2 + + psubusb scratch1, %1 ; v2 - v1 + psubusb %1, %2 ; v1 - v2 + por %1, scratch1 ; abs(v2 - v1) +%endmacro + +%macro LF_FILTER_HEV_MASK 8-9 + + LF_ABS %1, %2 ; abs(p3 - p2) + LF_ABS %2, %3 ; abs(p2 - p1) + pmaxub %1, %2 ; accumulate mask +%if %0 == 8 + movdqa scratch2, %3 ; save p1 + LF_ABS scratch2, %4 ; abs(p1 - p0) +%endif + LF_ABS %4, %5 ; abs(p0 - q0) + LF_ABS %5, %6 ; abs(q0 - q1) +%if %0 == 8 + pmaxub %5, scratch2 ; accumulate hev +%else + pmaxub %5, %9 +%endif + pmaxub %1, %5 ; accumulate mask + + LF_ABS %3, %6 ; abs(p1 - q1) + LF_ABS %6, %7 ; abs(q1 - q2) + pmaxub %1, %6 ; accumulate mask + LF_ABS %7, %8 ; abs(q2 - q3) + pmaxub %1, %7 ; accumulate mask + + paddusb %4, %4 ; 2 * abs(p0 - q0) + pand %3, [GLOBAL(tfe)] + psrlw %3, 1 ; abs(p1 - q1) / 2 + paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + psubusb %1, [limit] + psubusb %4, [blimit] + por %1, %4 + pcmpeqb %1, zero ; mask + + psubusb %5, [thresh] + pcmpeqb %5, zero ; ~hev +%endmacro + +%macro LF_FILTER 6 + ; %1-%4: p1-q1 + ; %5: mask + ; %6: hev + + movdqa scratch2, %6 ; save hev + + pxor %1, [GLOBAL(t80)] ; ps1 + pxor %4, [GLOBAL(t80)] ; qs1 + movdqa scratch1, %1 + psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) + pandn scratch2, scratch1 ; vp8_filter &= hev + + pxor %2, [GLOBAL(t80)] ; ps0 + pxor %3, [GLOBAL(t80)] ; qs0 + movdqa scratch1, %3 + psubsb scratch1, %2 ; qs0 - ps0 + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + pand %5, scratch2 ; &= mask + + movdqa scratch2, %5 + paddsb %5, [GLOBAL(t4)] ; Filter1 + paddsb scratch2, [GLOBAL(t3)] ; Filter2 + + ; Filter1 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 3 + pand scratch1, [GLOBAL(te0)] + pand %5, [GLOBAL(t1f)] + por %5, scratch1 + + psubsb %3, %5 ; qs0 - Filter1 + pxor %3, [GLOBAL(t80)] + + ; Filter2 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, scratch2 + psrlw scratch2, 3 + pand scratch1, [GLOBAL(te0)] + pand scratch2, [GLOBAL(t1f)] + por scratch2, scratch1 + + paddsb %2, scratch2 ; ps0 + Filter2 + pxor %2, [GLOBAL(t80)] + + ; outer tap adjustments + paddsb %5, [GLOBAL(t1)] + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 1 + pand scratch1, [GLOBAL(t80)] + pand %5, [GLOBAL(t7f)] + por %5, scratch1 + pand %5, %6 ; vp8_filter &= ~hev + + psubsb %4, %5 ; qs1 - vp8_filter + pxor %4, [GLOBAL(t80)] + + paddsb %1, %5 ; ps1 + vp8_filter + pxor %1, [GLOBAL(t80)] +%endmacro + +;void vp8_loop_filter_bh_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) +global sym(vp8_loop_filter_bh_y_sse2) PRIVATE +sym(vp8_loop_filter_bh_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 11 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi ; src_ptr + %define stride rsi ; src_pixel_step + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define i0 [src] + %define i1 [spp] + %define i2 [src + 2 * stride] + %define i3 [spp + 2 * stride] + %define i4 [src + 4 * stride] + %define i5 [spp + 4 * stride] + %define i6 [src + 2 * stride3] + %define i7 [spp + 2 * stride3] + %define i8 [src + 8 * stride] + %define i9 [spp + 8 * stride] + %define i10 [src + 2 * stride5] + %define i11 [spp + 2 * stride5] + %define i12 [src + 4 * stride3] + %define i13 [spp + 4 * stride3] + %define i14 [src + 2 * stride7] + %define i15 [spp + 2 * stride7] + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + pxor zero, zero + + ; load the first set into registers + movdqa xmm0, i0 + movdqa xmm1, i1 + movdqa xmm2, i2 + movdqa xmm3, i3 + movdqa xmm4, i4 + movdqa xmm8, i5 + movdqa xmm9, i6 ; q2, will contain abs(p1-p0) + movdqa xmm10, i7 +LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm3, i4 + movdqa xmm8, i5 +LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm3 + movdqa i5, xmm8 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm4, i8 + movdqa xmm8, i9 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm4 + movdqa i9, xmm8 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm3, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm3, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm3 + movdqa i13, xmm8 + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + + +;void vp8_loop_filter_bv_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) + +global sym(vp8_loop_filter_bv_y_sse2) PRIVATE +sym(vp8_loop_filter_bv_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 15 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi + %define stride rsi + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define s0 [src] + %define s1 [spp] + %define s2 [src + 2 * stride] + %define s3 [spp + 2 * stride] + %define s4 [src + 4 * stride] + %define s5 [spp + 4 * stride] + %define s6 [src + 2 * stride3] + %define s7 [spp + 2 * stride3] + %define s8 [src + 8 * stride] + %define s9 [spp + 8 * stride] + %define s10 [src + 2 * stride5] + %define s11 [spp + 2 * stride5] + %define s12 [src + 4 * stride3] + %define s13 [spp + 4 * stride3] + %define s14 [src + 2 * stride7] + %define s15 [spp + 2 * stride7] + + %define i0 [rsp] + %define i1 [rsp + 16] + %define i2 [rsp + 32] + %define i3 [rsp + 48] + %define i4 [rsp + 64] + %define i5 [rsp + 80] + %define i6 [rsp + 96] + %define i7 [rsp + 112] + %define i8 [rsp + 128] + %define i9 [rsp + 144] + %define i10 [rsp + 160] + %define i11 [rsp + 176] + %define i12 [rsp + 192] + %define i13 [rsp + 208] + %define i14 [rsp + 224] + %define i15 [rsp + 240] + + ALIGN_STACK 16, rax + + ; reserve stack space + %define temp_storage 0 ; size is 256 (16*16) + %define stack_size 256 + sub rsp, stack_size + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + + ; 8-f + movdqa xmm0, s8 + movdqa xmm1, xmm0 + punpcklbw xmm0, s9 ; 80 90 + punpckhbw xmm1, s9 ; 88 98 + + movdqa xmm2, s10 + movdqa xmm3, xmm2 + punpcklbw xmm2, s11 ; a0 b0 + punpckhbw xmm3, s11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s12 + movdqa xmm5, xmm3 + punpcklbw xmm3, s13 ; c0 d0 + punpckhbw xmm5, s13 ; c8 d8 + + movdqa xmm6, s14 + movdqa xmm7, xmm6 + punpcklbw xmm6, s15 ; e0 f0 + punpckhbw xmm7, s15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i0, xmm0 + movdqa i1, xmm7 + movdqa i2, xmm4 + movdqa i3, xmm3 + movdqa i4, xmm1 + movdqa i5, xmm8 + movdqa i6, xmm2 + movdqa i7, xmm5 + + ; 0-7 + movdqa xmm0, s0 + movdqa xmm1, xmm0 + punpcklbw xmm0, s1 ; 00 10 + punpckhbw xmm1, s1 ; 08 18 + + movdqa xmm2, s2 + movdqa xmm3, xmm2 + punpcklbw xmm2, s3 ; 20 30 + punpckhbw xmm3, s3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s4 + movdqa xmm5, xmm3 + punpcklbw xmm3, s5 ; 40 50 + punpckhbw xmm5, s5 ; 48 58 + + movdqa xmm6, s6 + movdqa xmm7, xmm6 + punpcklbw xmm6, s7 ; 60 70 + punpckhbw xmm7, s7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i0 + punpckhqdq xmm6, i0 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i1 + punpckhqdq xmm9, i1 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i2 + punpckhqdq xmm10, i2 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i3 + punpckhqdq xmm11, i3 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i4 + punpckhqdq xmm12, i4 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i5 + punpckhqdq xmm13, i5 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i6 + punpckhqdq xmm14, i6 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i7 + punpckhqdq xmm15, i7 + + movdqa i0, xmm0 + movdqa i1, xmm6 + movdqa i2, xmm7 + movdqa i3, xmm9 + movdqa i4, xmm4 + movdqa i5, xmm10 + movdqa i6, xmm3 + movdqa i7, xmm11 + movdqa i8, xmm1 + movdqa i9, xmm12 + movdqa i10, xmm8 + movdqa i11, xmm13 + movdqa i12, xmm2 + movdqa i13, xmm14 + movdqa i14, xmm5 + movdqa i15, xmm15 + +; TRANSPOSED DATA AVAILABLE ON THE STACK + + movdqa xmm12, xmm6 + movdqa xmm13, xmm7 + + pxor zero, zero + +LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm8, i4 + movdqa xmm9, i5 +LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm8 + movdqa i5, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm3, i8 + movdqa xmm4, i9 +LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm3 + movdqa i9, xmm4 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm8, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm4, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm4 + movdqa i13, xmm8 + + +; RESHUFFLE AND WRITE OUT + ; 8-f + movdqa xmm0, i8 + movdqa xmm1, xmm0 + punpcklbw xmm0, i9 ; 80 90 + punpckhbw xmm1, i9 ; 88 98 + + movdqa xmm2, i10 + movdqa xmm3, xmm2 + punpcklbw xmm2, i11 ; a0 b0 + punpckhbw xmm3, i11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i12 + movdqa xmm5, xmm3 + punpcklbw xmm3, i13 ; c0 d0 + punpckhbw xmm5, i13 ; c8 d8 + + movdqa xmm6, i14 + movdqa xmm7, xmm6 + punpcklbw xmm6, i15 ; e0 f0 + punpckhbw xmm7, i15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i8, xmm0 + movdqa i9, xmm7 + movdqa i10, xmm4 + movdqa i11, xmm3 + movdqa i12, xmm1 + movdqa i13, xmm8 + movdqa i14, xmm2 + movdqa i15, xmm5 + + ; 0-7 + movdqa xmm0, i0 + movdqa xmm1, xmm0 + punpcklbw xmm0, i1 ; 00 10 + punpckhbw xmm1, i1 ; 08 18 + + movdqa xmm2, i2 + movdqa xmm3, xmm2 + punpcklbw xmm2, i3 ; 20 30 + punpckhbw xmm3, i3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i4 + movdqa xmm5, xmm3 + punpcklbw xmm3, i5 ; 40 50 + punpckhbw xmm5, i5 ; 48 58 + + movdqa xmm6, i6 + movdqa xmm7, xmm6 + punpcklbw xmm6, i7 ; 60 70 + punpckhbw xmm7, i7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i8 + punpckhqdq xmm6, i8 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i9 + punpckhqdq xmm9, i9 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i10 + punpckhqdq xmm10, i10 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i11 + punpckhqdq xmm11, i11 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i12 + punpckhqdq xmm12, i12 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i13 + punpckhqdq xmm13, i13 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i14 + punpckhqdq xmm14, i14 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i15 + punpckhqdq xmm15, i15 + + movdqa s0, xmm0 + movdqa s1, xmm6 + movdqa s2, xmm7 + movdqa s3, xmm9 + movdqa s4, xmm4 + movdqa s5, xmm10 + movdqa s6, xmm3 + movdqa s7, xmm11 + movdqa s8, xmm1 + movdqa s9, xmm12 + movdqa s10, xmm8 + movdqa s11, xmm13 + movdqa s12, xmm2 + movdqa s13, xmm14 + movdqa s14, xmm5 + movdqa s15, xmm15 + + ; free stack space + add rsp, stack_size + + ; un-ALIGN_STACK + pop rsp + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + +SECTION_RODATA +align 16 +te0: + times 16 db 0xe0 +align 16 +t7f: + times 16 db 0x7f +align 16 +tfe: + times 16 db 0xfe +align 16 +t1f: + times 16 db 0x1f +align 16 +t80: + times 16 db 0x80 +align 16 +t1: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 diff --git a/media/libvpx/vp8/common/x86/loopfilter_mmx.asm b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm new file mode 100644 index 000000000..88a07b9f3 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm @@ -0,0 +1,1753 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_loop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_loop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [GLOBAL(t80)] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_loop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [GLOBAL(t80)] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [GLOBAL(t80)] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + ; mm7 = q1 + + ; transpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + + movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz .next8_v + + add rsp, 64 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_mbh: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 + + + ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm3=q1, mm7 = limit + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm5 = p0 + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, mm0 ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + + ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, mm4=hev + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [GLOBAL(t3)]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s27)] + pmulhw mm2, [GLOBAL(s27)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + movq [rsi+rax], mm6 + movq [rsi], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s18)] + pmulhw mm2, [GLOBAL(s18)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdi] + movq mm6, [rsi+rax*2] ; p1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi], mm3 + movq [rsi+rax*2], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s9)] + pmulhw mm2, [GLOBAL(s9)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + + movq mm6, [rdi+rax*4] + neg rax + movq mm3, [rdi+rax ] + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi+rax ], mm3 + neg rax + movq [rdi+rax*4], mm6 + +;EARLY_BREAK_OUT: + neg rax + add rsi,8 + dec rcx + jnz .next8_mbh + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_mbv: + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ;transpose + movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 + + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + + movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 + + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + lea rdx, srct + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + + movq [rdx+56], mm7 + psubusb mm5, mm7 ; q2-q3 + + + movq [rdx+48], mm6 + psubusb mm7, mm6 ; q3-q2 + + por mm7, mm5; ; mm7=abs (q3-q2) + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + psubusb mm3, mm6 ; q1-q2 + + psubusb mm6, mm5 ; q2-q1 + por mm6, mm3 ; mm6=abs(q2-q1) + + movq [rdx+40], mm5 ; save q1 + movq [rdx+32], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq [rdx], mm0 ; save p3 + movq [rdx+8], mm1 ; save p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+24], mm3 ; save p0 + + movq [rdx+16], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 ; abs(q3-q2) > limit + + psubusb mm0, mm4 ; abs(p3-p2) > limit + psubusb mm1, mm4 ; abs(p2-p1) > limit + + psubusb mm6, mm4 ; abs(q2-q1) > limit + por mm7, mm6 ; or + + por mm0, mm1 ; + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+32] ; mm5=q0 + movq mm7, [rdx+40] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 ; abs(q1 - q0) > thresh + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 ; abs(p1 - p0)> thresh + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + + ; start work on filters + lea rdx, srct + + ; start work on filters + movq mm2, [rdx+16] ; p1 + movq mm7, [rdx+40] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + movq mm6, [rdx+24] ; p0 + movq mm0, [rdx+32] ; q0 + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [GLOBAL(t3)]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s27)] + pmulhw mm2, [GLOBAL(s27)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + movq [rdx+24], mm6 + movq [rdx+32], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s18)] + pmulhw mm2, [GLOBAL(s18)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdx + 40] + movq mm6, [rdx + 16] ; p1 + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdx + 40], mm3 + movq [rdx + 16], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s9)] + pmulhw mm2, [GLOBAL(s9)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm6, [rdx+ 8] + movq mm3, [rdx+48] + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 + pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 + + ; transpose and write back + movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 + movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 + + punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 + punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 + + movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 + movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 + + punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 + punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 + + movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 + punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 + + punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 + movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 + + punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 + punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 + + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 + + movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 + punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 + + movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 + punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 + + punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 + movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 + + punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 + punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 + + movq [rsi+rax*4], mm0 ; write out + movq [rdi+rax*4], mm6 ; write out + + movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 + punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 + + punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 + movq [rsi+rax*2], mm0 ; write out + + movq [rdi+rax*2], mm5 ; write out + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + + punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 + punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 + + movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 + punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 + + punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 + movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 + + movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 + punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 + + punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 + movq [rsi], mm0 ; write out + + movq [rdi], mm1 ; write out + neg rax + + punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 + punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 + + movq [rsi+rax*2], mm3 + movq [rdi+rax*2], mm4 + + lea rsi, [rsi+rax*8] + dec rcx + + jnz .next8_mbv + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE +sym(vp8_loop_filter_simple_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + mov rcx, 2 ; count +.nexts8_h: + mov rdx, arg(2) ;blimit ; get blimit + movq mm3, [rdx] ; + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movq mm1, [rsi+2*rax] ; p1 + movq mm0, [rdi] ; q1 + movq mm2, mm1 + movq mm7, mm0 + movq mm4, mm0 + psubusb mm0, mm1 ; q1-=p1 + psubusb mm1, mm4 ; p1-=q1 + por mm1, mm0 ; abs(p1-q1) + pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm1, 1 ; abs(p1-q1)/2 + + movq mm5, [rsi+rax] ; p0 + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + movq mm6, mm5 ; p0 + psubusb mm5, mm4 ; p0-=q0 + psubusb mm4, mm6 ; q0-=p0 + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm3, mm3 + pcmpeqb mm5, mm3 + + ; start work on filters + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) + pand mm5, mm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + movq mm1, mm5 ; get a copy of filters + psraw mm1, 11 ; arithmetic shift right 11 + psllw mm1, 8 ; shift left 8 to put it back + + por mm0, mm1 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .nexts8_h + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE +sym(vp8_loop_filter_simple_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4- 2]; ; + mov rcx, 2 ; count +.nexts8_v: + + lea rdi, [rsi + rax]; + movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 + + movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + + movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 + movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 + + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + movq mm5, mm4 ; 53 43 52 42 51 41 50 40 + + punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 + punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 + + neg rax + + movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 + movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 + + punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 + movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 + + movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 + punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 + + movq mm2, mm0 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 13 03 12 02 11 01 10 00 + + punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 + movq mm3, mm2 ; 33 23 13 03 32 22 12 02 + + punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 + punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 + + punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 + + + ; calculate mask + movq mm6, mm0 ; p1 + movq mm7, mm3 ; q1 + psubusb mm7, mm6 ; q1-=p1 + psubusb mm6, mm3 ; p1-=q1 + por mm6, mm7 ; abs(p1-q1) + pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm6, 1 ; abs(p1-q1)/2 + + movq mm5, mm1 ; p0 + movq mm4, mm2 ; q0 + + psubusb mm5, mm2 ; p0-=q0 + psubusb mm4, mm1 ; q0-=p0 + + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] + + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm7, mm7 + pcmpeqb mm5, mm7 ; mm5 = mask + + ; start work on filters + movq t0, mm0 + movq t1, mm3 + + pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm0, mm3 ; p1 - q1 + movq mm6, mm1 ; p0 + + movq mm7, mm2 ; q0 + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + + pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm7 ; offseted ; q0 + + psubsb mm7, mm6 ; q0 - p0 + paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) + + pand mm5, mm0 ; mask filter values we don't care about + + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + movq mm7, mm5 ; get a copy of filters + psraw mm7, 11 ; arithmetic shift right 11 + psllw mm7, 8 ; shift left 8 to put it back + + por mm0, mm7 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0sz add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + + movq mm0, t0 + movq mm4, t1 + + ; mm0 = 70 60 50 40 30 20 10 00 + ; mm6 = 71 61 51 41 31 21 11 01 + ; mm3 = 72 62 52 42 32 22 12 02 + ; mm4 = 73 63 53 43 33 23 13 03 + ; transpose back to write out + + movq mm1, mm0 ; + punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 + + punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 + movq mm2, mm3 ; + + punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 + movq mm5, mm1 ; 71 70 61 60 51 50 41 40 + + punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 + movq mm6, mm0 ; 31 30 21 20 11 10 01 00 + + punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 + punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 + + movd [rsi+rax*4], mm0 ; write 03 02 01 00 + punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 + + psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 + punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 + + movd [rdi+rax*4], mm0 ; write 13 12 11 10 + movd [rsi+rax*2], mm6 ; write 23 22 21 20 + + psrlq mm6, 32 ; 33 32 31 30 + movd [rsi], mm1 ; write 43 42 41 40 + + movd [rsi + rax], mm6 ; write 33 32 31 30 + neg rax + + movd [rsi + rax*2], mm5 ; write 63 62 61 60 + psrlq mm1, 32 ; 53 52 51 50 + + movd [rdi], mm1 ; write out 53 52 51 50 + psrlq mm5, 32 ; 73 72 71 70 + + movd [rdi + rax*2], mm5 ; write 73 72 71 70 + + lea rsi, [rsi+rax*8] ; next 8 + + dec rcx + jnz .nexts8_v + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, +; int y_stride, +; loop_filter_info *lfi) +;{ +; +; +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +;} + +SECTION_RODATA +align 16 +tfe: + times 8 db 0xfe +align 16 +t80: + times 8 db 0x80 +align 16 +t1s: + times 8 db 0x01 +align 16 +t3: + times 8 db 0x03 +align 16 +t4: + times 8 db 0x04 +align 16 +ones: + times 4 dw 0x0001 +align 16 +s27: + times 4 dw 0x1b00 +align 16 +s18: + times 4 dw 0x1200 +align 16 +s9: + times 4 dw 0x0900 +align 16 +s63: + times 4 dw 0x003f diff --git a/media/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm new file mode 100644 index 000000000..1913abc69 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -0,0 +1,1640 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +%define _t0 0 +%define _t1 _t0 + 16 +%define _p3 _t1 + 16 +%define _p2 _p3 + 16 +%define _p1 _p2 + 16 +%define _p0 _p1 + 16 +%define _q0 _p0 + 16 +%define _q1 _q0 + 16 +%define _q2 _q1 + 16 +%define _q3 _q2 + 16 +%define lf_var_size 160 + +; Use of pmaxub instead of psubusb to compute filter mask was seen +; in ffvp8 + +%macro LFH_FILTER_AND_HEV_MASK 1 +%if %1 + movdqa xmm2, [rdi+2*rax] ; q3 + movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border +%else + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + + movdqa [rsp+_q2], xmm1 ; store q2 + movdqa [rsp+_q1], xmm4 ; store q1 +%endif + movdqa xmm7, [rdx] ;limit + + movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + + psubusb xmm1, xmm2 ; q2-=q3 + psubusb xmm2, xmm6 ; q3-=q2 + + psubusb xmm4, xmm6 ; q1-=q2 + psubusb xmm6, xmm3 ; q2-=q1 + + por xmm4, xmm6 ; abs(q2-q1) + por xmm1, xmm2 ; abs(q3-q2) + + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 + + psubusb xmm5, xmm3 ; q0-=q1 + psubusb xmm3, xmm0 ; q1-=q0 + + por xmm5, xmm3 ; abs(q0-q1) + movdqa [rsp+_t0], xmm5 ; save to t0 + + pmaxub xmm1, xmm5 + +%if %1 + movdqa xmm2, [rsi+4*rax] ; p3 + movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 +%else + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] + + movdqa [rsp+_p2], xmm4 ; store p2 + movdqa [rsp+_p1], xmm6 ; store p1 +%endif + + movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + + psubusb xmm4, xmm2 ; p2-=p3 + psubusb xmm2, xmm5 ; p3-=p2 + + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) + + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) + + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + + pmaxub xmm1, xmm3 ; abs(p2 - p1) +%if %1 + movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 +%else + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, [rsp+_q1] ; q1 +%endif + + movdqa xmm5, xmm4 ; p0 + psubusb xmm4, xmm6 ; p0-=p1 + + psubusb xmm6, xmm5 ; p1-=p0 + + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get blimit + + movdqa [rsp+_t1], xmm6 ; save to t1 + + movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + + psubusb xmm3, xmm2 ; q1-=p1 + psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 + por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm7, [rdx] ; blimit + mov rdx, arg(4) ; hev get thresh + + movdqa xmm3, xmm0 ; q0 + pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + + movdqa xmm6, xmm5 ; p0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; p0-=q0 + psubusb xmm3, xmm6 ; q0-=p0 + por xmm5, xmm3 ; abs(p0 - q0) + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + + movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) + movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) + + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm2, [rdx] ; hev + + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev + por xmm1, xmm5 + + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev + + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev +%endmacro + +%macro B_FILTER 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm6, xmm3 ; offset to convert to signed values + + pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) + pxor xmm0, xmm3 ; offset to convert to signed values + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 + paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + punpckhbw xmm5, xmm2 ; axbxcxdx + punpcklbw xmm2, xmm2 ; exfxgxhx + + punpcklbw xmm0, xmm1 ; exfxgxhx + psraw xmm5, 11 ; sign extended shift right by 3 + + punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + psraw xmm0, 11 ; sign extended shift right by 3 + + psraw xmm1, 11 ; sign extended shift right by 3 + movdqa xmm5, xmm0 ; save results + + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + + paddsb xmm6, xmm2 ; p0+= p0 add + + movdqa xmm2, [GLOBAL(ones)] + paddsw xmm5, xmm2 + paddsw xmm1, xmm2 + psraw xmm5, 1 ; partial shifted one more time for 2nd tap + psraw xmm1, 1 ; partial shifted one more time for 2nd tap + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm2, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_p1] ; p1 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rsp+_p1] ; p1 +%endif + + pandn xmm4, xmm5 ; high edge variance additive + pxor xmm6, xmm2 ; unoffset + + pxor xmm1, xmm2 ; reoffset + psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add + pxor xmm3, xmm2 ; unoffset + + pxor xmm1, xmm2 ; unoffset + psubsb xmm7, xmm4 ; q1-= q1 add + + pxor xmm7, xmm2 ; unoffset +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rax], xmm1 ; p1 + movhps [rdi + rax], xmm1 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + movq [rsi + rcx*2], xmm7 ; q1 + movhps [rdi + rcx*2], xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back +%endif + +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the result + B_FILTER 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro MB_FILTER_AND_WRITEBACK 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 + + mov rcx, rax + neg rcx +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 ; vp8_filter + + pand xmm2, xmm4 ; Filter2 = vp8_filter & hev + pxor xmm0, xmm0 + + pandn xmm4, xmm1 ; vp8_filter&=~hev + pxor xmm1, xmm1 + + punpcklbw xmm0, xmm4 ; Filter 2 (hi) + punpckhbw xmm1, xmm4 ; Filter 2 (lo) + + movdqa xmm5, xmm2 + + movdqa xmm4, [GLOBAL(s9)] + paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) + paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + + pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 + pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 + + punpckhbw xmm7, xmm5 ; axbxcxdx + punpcklbw xmm5, xmm5 ; exfxgxhx + + psraw xmm7, 11 ; sign extended shift right by 3 + + psraw xmm5, 11 ; sign extended shift right by 3 + punpckhbw xmm4, xmm2 ; axbxcxdx + + punpcklbw xmm2, xmm2 ; exfxgxhx + psraw xmm4, 11 ; sign extended shift right by 3 + + packsswb xmm5, xmm7 ; Filter2 >>=3; + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm4 ; Filter1 >>=3; + + paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + + psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 + movdqa xmm7, xmm1 + + movdqa xmm4, [GLOBAL(s63)] + movdqa xmm5, xmm0 + movdqa xmm2, xmm5 + paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 + paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 + movdqa xmm4, xmm7 + + paddw xmm5, xmm5 ; Filter 2 (hi) * 18 + + paddw xmm7, xmm7 ; Filter 2 (lo) * 18 + paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 + + paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 + paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 + psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 + + paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 + psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 + psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 + + packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 + psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 + psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 + + packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + movdqa xmm7, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_q1] ; q1 + movdqa xmm4, [rsp+_p1] ; p1 + lea rsi, [rsi+rcx*2] + lea rdi, [rdi+rcx*2] + +%elif %1 == 1 + movdqa xmm1, [rdi] ; q1 + movdqa xmm4, [rsi+rax*2] ; p1 +%elif %1 == 2 + movdqa xmm4, [rsp+_p1] ; p1 + movdqa xmm1, [rsp+_q1] ; q1 +%endif + + pxor xmm1, xmm7 + pxor xmm4, xmm7 + + psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) + paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) + psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) + paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) + +%if %1 == 1 + movdqa xmm2, [rdi+rax*4] ; p2 + movdqa xmm5, [rdi+rcx] ; q2 +%else + movdqa xmm2, [rsp+_p2] ; p2 + movdqa xmm5, [rsp+_q2] ; q2 +%endif + + pxor xmm1, xmm7 ; *oq1 = sq^0x80; + pxor xmm4, xmm7 ; *op1 = sp^0x80; + pxor xmm2, xmm7 + pxor xmm5, xmm7 + paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) + psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) + pxor xmm2, xmm7 ; *op2 = sp^0x80; + pxor xmm5, xmm7 ; *oq2 = sq^0x80; + pxor xmm3, xmm7 ; *oq0 = sq^0x80 + pxor xmm6, xmm7 ; *oq0 = sp^0x80 +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + lea rdx, [rcx + rcx*2] + movq [rsi+rcx*2], xmm1 ; q1 + movhps [rdi+rcx*2], xmm1 + + movq [rsi + rax], xmm4 ; p1 + movhps [rdi + rax], xmm4 + + movq [rsi+rax*2], xmm2 ; p2 + movhps [rdi+rax*2], xmm2 + + movq [rsi+rdx], xmm5 ; q2 + movhps [rdi+rdx], xmm5 +%elif %1 == 1 + movdqa [rdi+rcx], xmm5 ; q2 + movdqa [rdi], xmm1 ; q1 + movdqa [rsi], xmm3 ; q0 + movdqa [rsi+rax ], xmm6 ; p0 + movdqa [rsi+rax*2], xmm4 ; p1 + movdqa [rdi+rax*4], xmm2 ; p2 +%elif %1 == 2 + movdqa [rsp+_p1], xmm4 ; p1 + movdqa [rsp+_p0], xmm6 ; p0 + movdqa [rsp+_q0], xmm3 ; q0 + movdqa [rsp+_q1], xmm1 ; q1 +%endif + +%endmacro + + +;void vp8_mbloop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro TRANSPOSE_16X8 2 + movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 +%if %1 + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + +%if %1 == 0 + lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing + lea rsi, [rsi - 4] +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa [rsp+_t0], xmm2 ; save to free XMM2 + + movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 + +%if %2 == 0 + movdqa [rsp+_q3], xmm7 ; save 7 + movdqa [rsp+_q2], xmm6 ; save 6 +%endif + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa [rsp+_p1], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + movdqa [rsp+_p0], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rsp+_q0], xmm4 ; save 4 + movdqa [rsp+_q1], xmm5 ; save 5 + movdqa xmm1, [rsp+_t0] + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + +%if %2 == 0 + movdqa [rsp+_p2], xmm1 + movdqa [rsp+_p3], xmm2 +%endif + +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 0 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) + + movdqa xmm5, [rsp+_p1] ; p1 + pmaxub xmm0, xmm7 + + movdqa xmm2, xmm5 ; p1 + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + + por xmm2, xmm7 ; abs(p1-p0) + + pmaxub xmm0, xmm2 + + movdqa xmm5, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 + + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm4, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + pmaxub xmm0, xmm7 + + psubusb xmm0, [rdx] ; limit + + mov rdx, arg(2) ; blimit + movdqa xmm5, xmm4 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + movdqa xmm4, [rdx] ; blimit + mov rdx, arg(4) ; get thresh + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + por xmm1, xmm6 ; abs(q0-p0) + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + movdqa xmm3, [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh + + psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm2, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm2 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + movd [rsi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2], %1 + movd [rdi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rsi+2*rax+2], %1 + movd [rsi+2*rcx+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2*rax+2], %1 + movd [rdi+2*rcx+2], %2 +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%macro MBV_TRANSPOSE 0 + movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + + movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + + punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 +%endmacro + +%macro MBV_WRITEBACK_1 0 + movq [rsi], xmm0 + movhps [rdi], xmm0 + + movq [rsi+2*rax], xmm6 + movhps [rdi+2*rax], xmm6 + + movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + + movq [rsi+4*rax], xmm0 + movhps [rdi+4*rax], xmm0 + + movq [rsi+2*rcx], xmm3 + movhps [rdi+2*rcx], xmm3 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + + movdqa xmm0, xmm7 + punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + + movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 + punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 +%endmacro + +%macro MBV_WRITEBACK_2 0 + movq [rsi], xmm1 + movhps [rdi], xmm1 + + movq [rsi+2*rax], xmm5 + movhps [rdi+2*rax], xmm5 + + movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + + movq [rsi+4*rax], xmm1 + movhps [rdi+4*rax], xmm1 + + movq [rsi+2*rcx], xmm4 + movhps [rdi+2*rcx], xmm4 +%endmacro + + +;void vp8_mbloop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ; Transpose + TRANSPOSE_16X8 1, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + neg rax + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + ; transpose and write back + MBV_TRANSPOSE + + neg rax + + MBV_WRITEBACK_1 + + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ; Transpose + TRANSPOSE_16X8 0, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + ; transpose and write back + MBV_TRANSPOSE + + mov rsi, arg(0) ;u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_1 + mov rsi, arg(5) ;v_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + ; end prolog + + mov rcx, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + movdqa xmm6, [GLOBAL(tfe)] + lea rdx, [rcx + rax] + neg rax + + ; calculate mask + movdqa xmm0, [rdx] ; q1 + mov rdx, arg(2) ;blimit + movdqa xmm1, [rcx+2*rax] ; p1 + + movdqa xmm2, xmm1 + movdqa xmm3, xmm0 + + psubusb xmm0, xmm1 ; q1-=p1 + psubusb xmm1, xmm3 ; p1-=q1 + por xmm1, xmm0 ; abs(p1-q1) + pand xmm1, xmm6 ; set lsb of each byte to zero + psrlw xmm1, 1 ; abs(p1-q1)/2 + + movdqa xmm7, XMMWORD PTR [rdx] + + movdqa xmm5, [rcx+rax] ; p0 + movdqa xmm4, [rcx] ; q0 + movdqa xmm0, xmm4 ; q0 + movdqa xmm6, xmm5 ; p0 + psubusb xmm5, xmm4 ; p0-=q0 + psubusb xmm4, xmm6 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + + movdqa xmm4, [GLOBAL(t80)] + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 + + + ; start work on filters + pxor xmm2, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm2, xmm3 ; p1 - q1 + + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm0, xmm4 ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm2 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset + movdqa [rcx], xmm3 ; write back + + pxor xmm6, xmm4 ; unoffset + movdqa [rcx+rax], xmm6 ; write back + + ; begin epilog + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE +sym(vp8_loop_filter_simple_vertical_edge_sse2): + push rbp ; save old base pointer value. + mov rbp, rsp ; set new base pointer value. + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx ; save callee-saved reg + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + mov rdx, arg(2) ;blimit + + ; calculate mask + movdqa xmm6, xmm0 ; p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm7, [rdx] + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm4, [GLOBAL(t80)] + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm0, xmm3 ; p1 - q1 + + pxor xmm1, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to convert to signed values + + movdqa xmm3, xmm2 ; offseted ; q0 + psubsb xmm2, xmm1 ; q0 - p0 + paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm0 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm6, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm1, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset q0 + pxor xmm1, xmm4 ; unoffset p0 + + movdqa xmm0, t0 ; p1 + movdqa xmm4, t1 ; q1 + + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + + ; transpose back to write out + ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa xmm6, xmm0 + punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm5, xmm3 + punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + + movdqa xmm3, xmm6 + punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movd [rsi], xmm6 ; write the second 8-line result + movd [rdx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi], xmm6 + movd [rcx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rsi + rax*2], xmm6 + movd [rdx + rax*2], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi + rax*2], xmm6 + movd [rcx + rax*2], xmm3 + + neg rax + lea rsi, [rsi + rax*8] + neg rax + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd [rsi], xmm0 ; write the first 8-line result + movd [rdx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi], xmm0 + movd [rcx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rsi + rax*2], xmm0 + movd [rdx + rax*2], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi + rax*2], xmm0 + movd [rcx + rax*2], xmm2 + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 16 db 0xfe +align 16 +t80: + times 16 db 0x80 +align 16 +t1s: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 +align 16 +ones: + times 8 dw 0x0001 +align 16 +s9: + times 8 dw 0x0900 +align 16 +s63: + times 8 dw 0x003f +align 16 +te0: + times 16 db 0xe0 +align 16 +t1f: + times 16 db 0x1f diff --git a/media/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/vp8/common/x86/loopfilter_x86.c new file mode 100644 index 000000000..658600460 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_x86.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8/common/loopfilter.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#define prototype_loopfilter_nc(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh) + +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + +prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); +prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); + +#if HAVE_SSE2 && ARCH_X86_64 +prototype_loopfilter(vp8_loop_filter_bv_y_sse2); +prototype_loopfilter(vp8_loop_filter_bh_y_sse2); +#else +prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); +#endif +prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; + +#if HAVE_MMX +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); +} +#endif + + +/* Horizontal MB filtering */ +#if HAVE_SSE2 +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); +} + + +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); +} + + +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); +} + +#endif diff --git a/media/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/vp8/common/x86/mfqe_sse2.asm new file mode 100644 index 000000000..a8a7f568d --- /dev/null +++ b/media/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -0,0 +1,287 @@ +; +; Copyright (c) 2012 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp8_filter_by_weight16x16_sse2) PRIVATE +sym(vp8_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp8_filter_by_weight8x8_sse2) PRIVATE +sym(vp8_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE +sym(vp8_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. + pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 + diff --git a/media/libvpx/vp8/common/x86/postproc_mmx.asm b/media/libvpx/vp8/common/x86/postproc_mmx.asm new file mode 100644 index 000000000..a2b16327f --- /dev/null +++ b/media/libvpx/vp8/common/x86/postproc_mmx.asm @@ -0,0 +1,315 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + +;void vp8_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_mmx) PRIVATE +sym(vp8_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp8_rv))] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c<cols; c+=4) +.loop_col: + mov rsi, arg(0) ;s + pxor mm0, mm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + + ; this copies the last row down into the border 8 rows + mov rdi, rsi + mov rdx, arg(2) + sub rdx, 9 + imul rdx, rax + lea rdi, [rdi+rdx] + movq mm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_borderd ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], mm1 + + dec rcx + jne .init_borderd + + neg rax ; rax = -pitch + + ; this copies the first row up into the border 8 rows + mov rdi, rsi + movq mm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_border ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], mm1 + + dec rcx + jne .init_border + + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor mm5, mm5 + pxor mm6, mm6 ; + + pxor mm7, mm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movd mm1, DWORD PTR [rdi]; + punpcklbw mm1, mm0 ; + + paddw mm5, mm1 ; + pmullw mm1, mm1 ; + + movq mm2, mm1 ; + punpcklwd mm1, mm0 ; + + punpckhwd mm2, mm0 ; + paddd mm6, mm1 ; + + paddd mm7, mm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movd mm1, DWORD PTR [rsi] ; [s-pitch*8] + movd mm2, DWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw mm1, mm0 + punpcklbw mm2, mm0 + + paddw mm5, mm2 + psubw mm5, mm1 + + pmullw mm2, mm2 + movq mm4, mm2 + + punpcklwd mm2, mm0 + punpckhwd mm4, mm0 + + paddd mm6, mm2 + paddd mm7, mm4 + + pmullw mm1, mm1 + movq mm2, mm1 + + punpcklwd mm1, mm0 + psubd mm6, mm1 + + punpckhwd mm2, mm0 + psubd mm7, mm2 + + + movq mm3, mm6 + pslld mm3, 4 + + psubd mm3, mm6 + movq mm1, mm5 + + movq mm4, mm5 + pmullw mm1, mm1 + + pmulhw mm4, mm4 + movq mm2, mm1 + + punpcklwd mm1, mm4 + punpckhwd mm2, mm4 + + movq mm4, mm7 + pslld mm4, 4 + + psubd mm4, mm7 + + psubd mm3, mm1 + psubd mm4, mm2 + + psubd mm3, flimit2 + psubd mm4, flimit2 + + psrad mm3, 31 + psrad mm4, 31 + + packssdw mm3, mm4 + packsswb mm3, mm0 + + movd mm1, DWORD PTR [rsi+rax*8] + + movq mm2, mm1 + punpcklbw mm1, mm0 + + paddw mm1, mm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp8_rv))] + movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movq mm4, [sym(vp8_rv) + rcx*2] +%endif + paddw mm1, mm4 + psraw mm1, 4 + + packuswb mm1, mm0 + pand mm1, mm3 + + pandn mm3, mm2 + por mm1, mm3 + + and rcx, 15 + movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] + + cmp edx, 8 + jl .skip_assignment + + mov rcx, rdx + sub rcx, 8 + and rcx, 15 + movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] + movd [rsi], mm1 + +.skip_assignment + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + + add dword arg(0), 4 ; s += 4 + sub dword arg(3), 4 ; cols -= 4 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 136 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit2 + + +;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vp8_plane_add_noise_mmx) PRIVATE +sym(vp8_plane_add_noise_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movq mm1,[rsi+rax] ; get the source + + psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb mm1, [rdx+32] ;bothclamp + psubusb mm1, [rdx+16] ;whiteclamp + + movq mm2,[rdi+rax] ; get the noise for this line + paddb mm1,mm2 ; add it in + movq [rsi+rax],mm1 ; store the result + + add rax,8 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +Blur: + times 16 dw 16 + times 8 dw 64 + times 16 dw 16 + times 8 dw 0 + +rd: + times 4 dw 0x40 diff --git a/media/libvpx/vp8/common/x86/postproc_sse2.asm b/media/libvpx/vp8/common/x86/postproc_sse2.asm new file mode 100644 index 000000000..fed4ee5cc --- /dev/null +++ b/media/libvpx/vp8/common/x86/postproc_sse2.asm @@ -0,0 +1,723 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;macro in deblock functions +%macro FIRST_2_ROWS 0 + movdqa xmm4, xmm0 + movdqa xmm6, xmm0 + movdqa xmm5, xmm1 + pavgb xmm5, xmm3 + + ;calculate absolute value + psubusb xmm4, xmm1 + psubusb xmm1, xmm0 + psubusb xmm6, xmm3 + psubusb xmm3, xmm0 + paddusb xmm4, xmm1 + paddusb xmm6, xmm3 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm7, xmm2 + + ;get mask + psubusb xmm2, xmm4 + psubusb xmm7, xmm6 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm7, xmm1 + por xmm7, xmm2 +%endmacro + +%macro SECOND_2_ROWS 0 + movdqa xmm6, xmm0 + movdqa xmm4, xmm0 + movdqa xmm2, xmm1 + pavgb xmm1, xmm3 + + ;calculate absolute value + psubusb xmm6, xmm2 + psubusb xmm2, xmm0 + psubusb xmm4, xmm3 + psubusb xmm3, xmm0 + paddusb xmm6, xmm2 + paddusb xmm4, xmm3 + + pavgb xmm5, xmm1 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm3, xmm2 + + ;get mask + psubusb xmm2, xmm6 + psubusb xmm3, xmm4 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm3, xmm1 + + por xmm7, xmm2 + por xmm7, xmm3 + + pavgb xmm5, xmm0 + + ;decide if or not to use filtered value + pand xmm0, xmm7 + pandn xmm7, xmm5 + paddusb xmm0, xmm7 +%endmacro + +%macro UPDATE_FLIMIT 0 + movdqa xmm2, XMMWORD PTR [rbx] + movdqa [rsp], xmm2 + add rbx, 16 +%endmacro + +;void vp8_post_proc_down_and_across_mb_row_sse2 +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int cols, +; int *flimits, +; int size +;) +global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE +sym(vp8_post_proc_down_and_across_mb_row_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 + + ; put flimit on stack + mov rbx, arg(5) ;flimits ptr + UPDATE_FLIMIT + +%define flimit [rsp] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line + movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock +.nextrow: + xor rdx, rdx ;col +.nextcol: + ;load current and next 2 rows + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + 2*rax] + + FIRST_2_ROWS + + ;load above 2 rows + neg rax + movdqu xmm1, XMMWORD PTR [rsi + 2*rax] + movdqu xmm3, XMMWORD PTR [rsi + rax] + + SECOND_2_ROWS + + movdqu XMMWORD PTR [rdi], xmm0 + + neg rax ; positive stride + add rsi, 16 + add rdi, 16 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .downdone + UPDATE_FLIMIT + jmp .nextcol + +.downdone: + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + mov rbx, arg(5) ; flimits + UPDATE_FLIMIT + + ; dup the first byte into the left border 8 times + movq mm1, [rdi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + mov rdx, -8 + movq [rdi+rdx], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(4) + movq mm1, [rdi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rdi+rdx], mm1 + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-16]; + movq mm1, QWORD PTR [rdi-8]; + +.acrossnextcol: + movdqu xmm0, XMMWORD PTR [rdi + rdx] + movdqu xmm1, XMMWORD PTR [rdi + rdx -2] + movdqu xmm3, XMMWORD PTR [rdi + rdx -1] + + FIRST_2_ROWS + + movdqu xmm1, XMMWORD PTR [rdi + rdx +1] + movdqu xmm3, XMMWORD PTR [rdi + rdx +2] + + SECOND_2_ROWS + + movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes + movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes + movdq2q mm0, xmm0 + psrldq xmm0, 8 + movdq2q mm1, xmm0 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .acrossdone + UPDATE_FLIMIT + jmp .acrossnextcol + +.acrossdone + ; last 16 pixels + movq QWORD PTR [rdi+rdx-16], mm0 + + cmp edx, dword arg(4) + jne .throw_last_8 + movq QWORD PTR [rdi+rdx-8], mm1 +.throw_last_8: + ; done with this rwo + add rsi,rax ;next src line + mov eax, dword arg(3) ;dst_pixels_per_line + add rdi,rax ;next destination + mov eax, dword arg(2) ;src_pixels_per_line + + mov rbx, arg(5) ;flimits + UPDATE_FLIMIT + + dec rcx ;decrement count + jnz .nextrow ;next row + + add rsp, 16 + pop rsp + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit + +;void vp8_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_xmm) PRIVATE +sym(vp8_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp8_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c<cols; c+=8) +.loop_col: + mov rsi, arg(0) ; s + pxor xmm0, xmm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + + ; this copies the last row down into the border 8 rows + mov rdi, rsi + mov rdx, arg(2) + sub rdx, 9 + imul rdx, rax + lea rdi, [rdi+rdx] + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_borderd ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_borderd + + neg rax ; rax = -pitch + + ; this copies the first row up into the border 8 rows + mov rdi, rsi + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_border ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_border + + + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movq xmm1, QWORD PTR [rdi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] + movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [rsi+rax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp8_rv))] + movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movdqu xmm4, [sym(vp8_rv) + rcx*2] +%endif + + paddw xmm1, xmm4 + ;paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and rcx, 15 + movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + + cmp edx, 8 + jl .skip_assignment + + mov rcx, rdx + sub rcx, 8 + and rcx, 15 + movq mm0, [rsp + rcx*8] ;d[rcx*8] + movq [rsi], mm0 + +.skip_assignment + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + add dword arg(0), 8 ; s += 8 + sub dword arg(3), 8 ; cols -= 8 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 128+16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE +sym(vp8_mbpost_proc_across_ip_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r<rows;r++) +.ip_row_loop: + + xor rdx, rdx ;sumsq=0; + xor rcx, rcx ;sum=0; + mov rsi, arg(0); s + + + ; dup the first byte into the left border 8 times + movq mm1, [rsi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + + mov rdi, -8 + movq [rsi+rdi], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(3) + movq mm1, [rsi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rsi+rdx], mm1 + +.ip_var_loop: + ;for(i=-8;i<=6;i++) + ;{ + ; sumsq += s[i]*s[i]; + ; sum += s[i]; + ;} + movzx eax, byte [rsi+rdi] + add ecx, eax + mul al + add edx, eax + add rdi, 1 + cmp rdi, 6 + jle .ip_var_loop + + + ;mov rax, sumsq + ;movd xmm7, rax + movd xmm7, edx + + ;mov rax, sum + ;movd xmm6, rax + movd xmm6, ecx + + mov rsi, arg(0) ;s + xor rcx, rcx + + movsxd rdx, dword arg(3) ;cols + add rdx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 +.nextcol4: + + movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 + movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 + + punpcklbw xmm1, xmm0 ; expanding + punpcklbw xmm2, xmm0 ; expanding + + punpcklwd xmm1, xmm0 ; expanding to dwords + punpcklwd xmm2, xmm0 ; expanding to dwords + + psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 ; duplicate the last ones + pshufd xmm7, xmm7, 0 ; duplicate the last ones + + psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 + psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared + pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [rsi+rcx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, [GLOBAL(four8s)] + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [rsi+rcx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add rcx, 4 + + cmp rcx, rdx + jl .nextcol4 + + ;s+=pitch; + movsxd rax, dword arg(1) + add arg(0), rax + + sub dword arg(2), 1 ;rows-=1 + cmp dword arg(2), 0 + jg .ip_row_loop + + add rsp, 16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vp8_plane_add_noise_wmt) PRIVATE +sym(vp8_plane_add_noise_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +four8s: + times 4 dd 8 diff --git a/media/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/vp8/common/x86/recon_mmx.asm new file mode 100644 index 000000000..15e98713c --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_mmx.asm @@ -0,0 +1,274 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x8_mmx) PRIVATE +sym(vp8_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x4_mmx) PRIVATE +sym(vp8_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_mmx) PRIVATE +sym(vp8_copy_mem16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movsxd rax, dword ptr arg(1) ;src_stride; + + mov rdi, arg(2) ;dst; + movsxd rcx, dword ptr arg(3) ;dst_stride + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq [rdi], mm0 + movq [rdi+8], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/vp8/common/x86/recon_sse2.asm new file mode 100644 index 000000000..7141f8324 --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_sse2.asm @@ -0,0 +1,1115 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_sse2) PRIVATE +sym(vp8_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_intra_pred_uv_dc_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dc_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ; from top + mov rdi, arg(2) ;above; + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + pxor mm0, mm0 + movq mm1, [rdi] + lea rdi, [rax*3] + psadbw mm1, mm0 + ; from left + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax*1] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + + movzx edx, byte [rsi+rdi] + lea rsi, [rsi+rax*4] + add ecx, edx + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + + ; add up + pextrw edx, mm1, 0x0 + lea edx, [edx+ecx+8] + sar edx, 4 + movd mm1, edx + movsxd rcx, dword ptr arg(1) ;dst_stride + pshufw mm1, mm1, 0x0 + mov rdi, arg(0) ;dst; + packuswb mm1, mm1 + + ; write out + lea rax, [rcx*3] + lea rdx, [rdi+rcx*4] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + movq [rdx ], mm1 + movq [rdx+rcx ], mm1 + movq [rdx+rcx*2], mm1 + movq [rdx+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dctop_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dctop_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;arg(3), arg(4) not used + + ; from top + mov rsi, arg(2) ;above; + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; add up + paddw mm1, [GLOBAL(dc_4)] + psraw mm1, 3 + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dcleft_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dcleft_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;arg(2) not used + + ; from left + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+4] + + ; add up + shr edx, 3 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dc128_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE +sym(vp8_intra_pred_uv_dc128_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + ;arg(2), arg(3), arg(4) not used + + ; write out + movq mm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +%macro vp8_intra_pred_uv_tm 1 +global sym(vp8_intra_pred_uv_tm_%1) PRIVATE +sym(vp8_intra_pred_uv_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + ; read top row + mov edx, 4 + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm2, [GLOBAL(dc_1024)] +%endif + movq xmm1, [rsi] + punpcklbw xmm1, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm3, [rsi-1] + mov rsi, arg(3) ;left; +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + pshuflw xmm3, xmm3, 0x0 + punpcklqdq xmm3, xmm3 +%else + pshufb xmm3, xmm2 +%endif + psubw xmm1, xmm3 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +.vp8_intra_pred_uv_tm_%1_loop: + mov bl, [rsi] + movd xmm3, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm3, xmm3, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm3, xmm3 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm3, xmm2 + pshufb xmm5, xmm2 +%endif + paddw xmm3, xmm1 + paddw xmm5, xmm1 + packuswb xmm3, xmm5 + movq [rdi ], xmm3 + movhps[rdi+rcx], xmm3 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz .vp8_intra_pred_uv_tm_%1_loop + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_uv_tm sse2 +vp8_intra_pred_uv_tm ssse3 + +;void vp8_intra_pred_uv_ve_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE +sym(vp8_intra_pred_uv_ve_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + ; end prolog + + ; arg(3), arg(4) not used + + ; read from top + mov rax, arg(2) ;src; + + movq mm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_ho_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +%macro vp8_intra_pred_uv_ho 1 +global sym(vp8_intra_pred_uv_ho_%1) PRIVATE +sym(vp8_intra_pred_uv_ho_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx +%ifidn %1, ssse3 + GET_GOT rbx +%endif + ; end prolog + + ;arg(2) not used + + ; read from left and write out +%ifidn %1, mmx2 + mov edx, 4 +%endif + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride +%ifidn %1, ssse3 + lea rdx, [rcx*3] + movdqa xmm2, [GLOBAL(dc_00001111)] +%endif + +%ifidn %1, mmx2 +.vp8_intra_pred_uv_ho_%1_loop: + mov bl, [rsi] + movd mm0, ebx + + mov bl, [rsi+rax] + movd mm1, ebx + + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm0, mm0, 0x0 + pshufw mm1, mm1, 0x0 + movq [rdi ], mm0 + movq [rdi+rcx], mm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz .vp8_intra_pred_uv_ho_%1_loop +%else + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 + lea rsi, [rsi+rax*4] + lea rdi, [rdi+rcx*4] + + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 +%endif + + ; begin epilog +%ifidn %1, ssse3 + RESTORE_GOT +%endif + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_uv_ho mmx2 +vp8_intra_pred_uv_ho ssse3 + +;void vp8_intra_pred_y_dc_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dc_sse2) PRIVATE +sym(vp8_intra_pred_y_dc_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ; from top + mov rdi, arg(2) ;above + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + + pxor xmm0, xmm0 + movdqa xmm1, [rdi] + psadbw xmm1, xmm0 + movq xmm2, xmm1 + punpckhqdq xmm1, xmm1 + paddw xmm1, xmm2 + + ; from left + lea rdi, [rax*3] + + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + + ; add up + pextrw edx, xmm1, 0x0 + lea edx, [edx+ecx+16] + sar edx, 5 + movd xmm1, edx + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + dec rsi + jnz .label + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dctop_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE +sym(vp8_intra_pred_y_dctop_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + GET_GOT rbx + ; end prolog + + ;arg(3), arg(4) not used + + ; from top + mov rcx, arg(2) ;above; + pxor xmm0, xmm0 + movdqa xmm1, [rcx] + psadbw xmm1, xmm0 + movdqa xmm2, xmm1 + punpckhqdq xmm1, xmm1 + paddw xmm1, xmm2 + + ; add up + paddw xmm1, [GLOBAL(dc_8)] + psraw xmm1, 4 + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdx, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdx ], xmm1 + movdqa [rdx+rcx ], xmm1 + movdqa [rdx+rcx*2], xmm1 + movdqa [rdx+rax ], xmm1 + lea rdx, [rdx+rcx*4] + movdqa [rdx ], xmm1 + movdqa [rdx+rcx ], xmm1 + movdqa [rdx+rcx*2], xmm1 + movdqa [rdx+rax ], xmm1 + lea rdx, [rdx+rcx*4] + dec rsi + jnz .label + + ; begin epilog + RESTORE_GOT + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dcleft_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE +sym(vp8_intra_pred_y_dcleft_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;arg(2) not used + + ; from left + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+8] + + ; add up + shr edx, 4 + movd xmm1, edx + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + dec rsi + jnz .label + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dc128_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE +sym(vp8_intra_pred_y_dc128_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + GET_GOT rbx + ; end prolog + + ;arg(2), arg(3), arg(4) not used + + ; write out + mov rsi, 2 + movdqa xmm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + +.label + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + dec rsi + jnz .label + + ; begin epilog + RESTORE_GOT + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +%macro vp8_intra_pred_y_tm 1 +global sym(vp8_intra_pred_y_tm_%1) PRIVATE +sym(vp8_intra_pred_y_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rbx + GET_GOT rbx + ; end prolog + + ; read top row + mov edx, 8 + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm3, [GLOBAL(dc_1024)] +%endif + movdqa xmm1, [rsi] + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm4, [rsi-1] + mov rsi, arg(3) ;left +%ifidn %1, sse2 + punpcklbw xmm4, xmm0 + pshuflw xmm4, xmm4, 0x0 + punpcklqdq xmm4, xmm4 +%else + pshufb xmm4, xmm3 +%endif + psubw xmm1, xmm4 + psubw xmm2, xmm4 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride +vp8_intra_pred_y_tm_%1_loop: + mov bl, [rsi] + movd xmm4, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx +%ifidn %1, sse2 + punpcklbw xmm4, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm4, xmm4, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm4, xmm4 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm4, xmm3 + pshufb xmm5, xmm3 +%endif + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + paddw xmm4, xmm1 + paddw xmm6, xmm2 + paddw xmm5, xmm1 + paddw xmm7, xmm2 + packuswb xmm4, xmm6 + packuswb xmm5, xmm7 + movdqa [rdi ], xmm4 + movdqa [rdi+rcx], xmm5 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_y_tm_%1_loop + + ; begin epilog + RESTORE_GOT + pop rbx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_y_tm sse2 +vp8_intra_pred_y_tm ssse3 + +;void vp8_intra_pred_y_ve_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_ve_sse2) PRIVATE +sym(vp8_intra_pred_y_ve_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + ; end prolog + + ;arg(3), arg(4) not used + + mov rax, arg(2) ;above; + mov rsi, 2 + movsxd rdx, dword ptr arg(1) ;dst_stride + + ; read from top + movdqa xmm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + lea rcx, [rdx*3] + +.label + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + dec rsi + jnz .label + + ; begin epilog + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_ho_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_y_ho_sse2) PRIVATE +sym(vp8_intra_pred_y_ho_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + ;arg(2) not used + + ; read from left and write out + mov edx, 8 + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +vp8_intra_pred_y_ho_sse2_loop: + mov bl, [rsi] + movd xmm0, ebx + mov bl, [rsi+rax] + movd xmm1, ebx + + ; FIXME use pshufb for ssse3 version + punpcklbw xmm0, xmm0 + punpcklbw xmm1, xmm1 + pshuflw xmm0, xmm0, 0x0 + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + movdqa [rdi ], xmm0 + movdqa [rdi+rcx], xmm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_y_ho_sse2_loop + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +dc_128: + times 16 db 128 +dc_4: + times 4 dw 4 +align 16 +dc_8: + times 8 dw 8 +align 16 +dc_1024: + times 8 dw 0x400 +align 16 +dc_00001111: + times 8 db 0 + times 8 db 1 diff --git a/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c new file mode 100644 index 000000000..65f4251a9 --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/blockd.h" + +#define build_intra_predictors_mbuv_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *above, \ + const unsigned char *left, int left_stride) +typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t)); + +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3); + +static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_stride, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + build_intra_predictors_mbuv_fn_t tm_func, + build_intra_predictors_mbuv_fn_t ho_func) +{ + int mode = x->mode_info_context->mbmi.uv_mode; + build_intra_predictors_mbuv_fn_t fn; + + switch (mode) { + case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break; + case H_PRED: fn = ho_func; break; + case TM_PRED: fn = tm_func; break; + case DC_PRED: + if (x->up_available) { + if (x->left_available) { + fn = vp8_intra_pred_uv_dc_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dctop_mmx2; break; + } + } else if (x->left_available) { + fn = vp8_intra_pred_uv_dcleft_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dc128_mmx; break; + } + break; + default: return; + } + + fn(dst_u, dst_stride, uabove_row, uleft, left_stride); + fn(dst_v, dst_stride, vabove_row, vleft, left_stride); +} + +void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, + vp8_intra_pred_uv_tm_sse2, + vp8_intra_pred_uv_ho_mmx2); +} + +void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, + vp8_intra_pred_uv_tm_ssse3, + vp8_intra_pred_uv_ho_ssse3); +} + +#define build_intra_predictors_mby_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *above, \ + const unsigned char *left, int left_stride) +typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t)); + +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3); + +static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char *dst_y, + int dst_stride, + unsigned char * yleft, + int left_stride, + build_intra_predictors_mby_fn_t tm_func) +{ + int mode = x->mode_info_context->mbmi.mode; + build_intra_predictors_mbuv_fn_t fn; + + switch (mode) { + case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break; + case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break; + case TM_PRED: fn = tm_func; break; + case DC_PRED: + if (x->up_available) { + if (x->left_available) { + fn = vp8_intra_pred_y_dc_sse2; break; + } else { + fn = vp8_intra_pred_y_dctop_sse2; break; + } + } else if (x->left_available) { + fn = vp8_intra_pred_y_dcleft_sse2; break; + } else { + fn = vp8_intra_pred_y_dc128_sse2; break; + } + break; + default: return; + } + + fn(dst_y, dst_stride, yabove_row, yleft, left_stride); + return; +} + +void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, + vp8_intra_pred_y_tm_sse2); +} + +void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, + vp8_intra_pred_y_tm_ssse3); + +} diff --git a/media/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/vp8/common/x86/subpixel_mmx.asm new file mode 100644 index 000000000..47dd45229 --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -0,0 +1,702 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) + + +%define BLOCK_HEIGHT_WIDTH 4 +%define vp8_filter_weight 128 +%define VP8_FILTER_SHIFT 7 + + +;void vp8_filter_block1d_h6_mmx +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1d_h6_mmx) PRIVATE +sym(vp8_filter_block1d_h6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + + movq mm1, [rdx + 16] ; do both the negative taps first!!! + movq mm2, [rdx + 32] ; + movq mm6, [rdx + 48] ; + movq mm7, [rdx + 64] ; + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + movq mm3, [rsi-2] ; mm3 = p-2..p5 + movq mm4, mm3 ; mm4 = p-2..p5 + psrlq mm3, 8 ; mm3 = p-1..p5 + punpcklbw mm3, mm0 ; mm3 = p-1..p2 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + movq mm5, mm4 ; mm5 = p-2..p5 + punpckhbw mm4, mm0 ; mm5 = p2..p5 + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + movq mm4, mm5 ; mm4 = p-2..p5; + psrlq mm5, 16 ; mm5 = p0..p5; + punpcklbw mm5, mm0 ; mm5 = p0..p3 + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + movq mm5, mm4 ; mm5 = p-2..p5 + psrlq mm4, 24 ; mm4 = p1..p5 + punpcklbw mm4, mm0 ; mm4 = p1..p4 + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + ; do outer positive taps + movd mm4, [rsi+3] + punpcklbw mm4, mm0 ; mm5 = p3..p6 + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + paddsw mm3, [GLOBAL(rd)] ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and unpack to saturate + punpcklbw mm3, mm0 ; + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line + add rdi, rax; +%else + movsxd r8, dword ptr arg(2) ;src_pixels_per_line + add rdi, rax; + + add rsi, r8 ; next line +%endif + + dec rcx ; decrement count + jnz .nextrow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1dc_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int output_pitch, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1dc_v6_mmx) PRIVATE +sym(vp8_filter_block1dc_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [GLOBAL(rd)] + push rbx + mov rbx, arg(7) ;vp8_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! + movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(5) ;output_height + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +.nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. + lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz .nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x8_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_mmx) PRIVATE +sym(vp8_bilinear_predict8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + shl rax, 5 ; offset * 32 + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + + shl rax, 5 ; offset*32 + add rax, rcx ; VFilter + + lea rcx, [rdi+rdx*8] ; + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x8: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 ;dst_pitch +%endif + cmp rdi, rcx ; + jne .next_row_8x8 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x4_mmx) PRIVATE +sym(vp8_bilinear_predict8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + mov rsi, arg(0) ;src_ptr ; + add rax, rcx + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x4: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 +%endif + cmp rdi, rcx ; + jne .next_row_8x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict4x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict4x4_mmx) PRIVATE +sym(vp8_bilinear_predict4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;ldst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movq mm7, mm3 ; + packuswb mm7, mm0 ; + + add rsi, rdx ; next line +.next_row_4x4: + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + + movq mm5, mm7 ; + punpcklbw mm5, mm0 ; + + pmullw mm5, [rax] ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + movq mm7, mm3 ; + + packuswb mm7, mm0 ; + + pmullw mm3, [rax+16] ; + paddw mm3, mm5 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb mm3, mm0 + movd [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch ; + add rsi, rdx ; next line + add rdi, r8 +%endif + + cmp rdi, rcx ; + jne .next_row_4x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global HIDDEN_DATA(sym(vp8_six_tap_mmx)) +sym(vp8_six_tap_mmx): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + times 8 dw 0 + + diff --git a/media/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/vp8/common/x86/subpixel_sse2.asm new file mode 100644 index 000000000..69f8d103c --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -0,0 +1,1372 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +global sym(vp8_filter_block1d8_h6_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_h6_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d8_v6_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_v6_sse2 +;( +; unsigned short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; const short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_v6_sse2) PRIVATE +sym(vp8_filter_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d16_v6_sse2_loop: +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. + movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] + pmullw xmm1, [rax + 16] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm3, [rax + 64] + pmullw xmm4, [rax + 64] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm5, [rax + 32] + pmullw xmm6, [rax + 32] + + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 + movdqa xmm0, XMMWORD PTR [rsi + 16] + pmullw xmm7, [rax] + pmullw xmm0, [rax] + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + paddsw xmm1, xmm7 + paddsw xmm2, xmm0 + + add rsi, rdx + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm3, [rax + 48] + pmullw xmm4, [rax + 48] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm5, [rax + 80] + pmullw xmm6, [rax + 80] + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] + pxor xmm0, xmm0 ; clear xmm0 + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + + paddsw xmm1, xmm7 + paddsw xmm2, xmm7 + + psraw xmm1, 7 + psraw xmm2, 7 + + packuswb xmm1, xmm2 ; pack and saturate + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_only_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + + movq QWORD PTR [rdi], xmm4 ; store the results in the destination + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_only_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_only_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; lower 8 bytes + + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; higher 8 bytes + + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; Second-pass filter only when xoffset==0 +global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + mov rax, arg(5) ;vp8_filter + + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_only_sse2_loop: + movq xmm1, MMWORD PTR [rsi] + movq xmm2, MMWORD PTR [rsi + rdx] + movq xmm3, MMWORD PTR [rsi + rdx * 2] + movq xmm5, MMWORD PTR [rsi + rdx * 4] + add rsi, rdx + movq xmm4, MMWORD PTR [rsi + rdx * 2] + movq xmm6, MMWORD PTR [rsi + rdx * 4] + + punpcklbw xmm1, xmm0 + pmullw xmm1, [rax] + + punpcklbw xmm2, xmm0 + pmullw xmm2, [rax + 16] + + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax + 32] + + punpcklbw xmm5, xmm0 + pmullw xmm5, [rax + 64] + + punpcklbw xmm4, xmm0 + pmullw xmm4, [rax + 48] + + punpcklbw xmm6, xmm0 + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE +sym(vp8_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +.unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz .unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict16x16_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp8_bilinear_filters_x86_8) +global sym(vp8_bilinear_predict16x16_sse2) PRIVATE +sym(vp8_bilinear_predict16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + movsxd rax, dword ptr arg(2) ;xoffset + + cmp rax, 0 ;skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + + cmp rax, 0 ;skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;dst_pitch +%endif + ; get the first horizontal line done + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + add rsi, rdx ; next line +.next_row: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, [rax] + pmullw xmm6, [rax] + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + pmullw xmm3, [rax+16] + pmullw xmm4, [rax+16] + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rdx ; next line +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ;dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + add rsi, rax ; next line +.next_row_spo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + movdqa xmm4, xmm3 ; make a copy of current line + movdqa xmm7, xmm3 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm5, xmm1 + pmullw xmm6, xmm1 + pmullw xmm3, xmm2 + pmullw xmm4, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ;dst_pitch + cmp rdi, rcx + jne .next_row_spo + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + pxor xmm0, xmm0 + +.next_row_fpo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ; dst_pitch + cmp rdi, rcx + jne .next_row_fpo + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict8x8_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_sse2) PRIVATE +sym(vp8_bilinear_predict8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ;xoffset + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm5, [rax] + movdqa xmm6, [rax+16] + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqa xmm3, XMMWORD PTR [rsp] + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + add rsp, 16 ; next line +.next_row8x8: + movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + pmullw xmm7, xmm5 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm4, xmm3 + + pmullw xmm3, xmm6 + paddw xmm3, xmm7 + + movdqa xmm7, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb xmm3, xmm0 + movq [rdi], xmm3 ; store the results in the destination + + add rsp, 16 ; next line + add rdi, rdx + + cmp rdi, rcx + jne .next_row8x8 + + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/media/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm new file mode 100644 index 000000000..c06f24556 --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -0,0 +1,1508 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE +sym(vp8_filter_block1d8_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 + + movdqa xmm7, [GLOBAL(rd)] + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + mov rdi, arg(2) ;output_ptr + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d8_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx +;xmm3 free +.filter_block1d8_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + pmaddubsw xmm1, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm1 + paddsw xmm2, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + jnz .filter_block1d8_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] + movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] + + mov rsi, arg(0) ;src_ptr + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx + +.filter_block1d8_h4_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm2, xmm0 + pshufb xmm0, xmm3 + + pshufb xmm2, xmm4 + pmaddubsw xmm0, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz .filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE +sym(vp8_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d16_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + movq xmm3, MMWORD PTR [rsi + 6] + + pmaddubsw xmm1, xmm5 + movq xmm7, MMWORD PTR [rsi + 11] + + pmaddubsw xmm2, xmm6 + punpcklbw xmm3, xmm7 + + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + + pmaddubsw xmm3, xmm4 + paddsw xmm0, xmm2 + + movdqa xmm2, xmm1 + paddsw xmm0, [GLOBAL(rd)] + + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + + psraw xmm0, 7 + pmaddubsw xmm1, xmm5 + + pmaddubsw xmm2, xmm6 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + + paddsw xmm3, xmm2 + + paddsw xmm3, [GLOBAL(rd)] + + psraw xmm3, 7 + + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE +sym(vp8_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + movdqa xmm7, [GLOBAL(rd)] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +.filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf1b)] + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2b)] + pmaddubsw xmm0, xmm4 + pshufb xmm2, [GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movd DWORD PTR [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d4_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] + pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + psraw xmm1, 7 + packuswb xmm1, xmm1 + + movd DWORD PTR [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp8_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE +sym(vp8_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + +.vp8_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + +.vp8_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [GLOBAL(rd)] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [GLOBAL(rd)] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE +sym(vp8_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [GLOBAL(rd)] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d4_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE +sym(vp8_filter_block1d4_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_v4_ssse3 + + movq mm5, MMWORD PTR [rax] ;k0_k5 + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v6_ssse3_loop: + movd mm1, DWORD PTR [rsi] ;A + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + movd mm0, DWORD PTR [rax + rdx * 4] ;F + + movq mm4, [GLOBAL(rd)] + + pmaddubsw mm3, mm6 + punpcklbw mm1, mm0 ;A F + pmaddubsw mm2, mm7 + pmaddubsw mm1, mm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm1 + paddsw mm2, mm4 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_v4_ssse3: + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + movq mm5, MMWORD PTR [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v4_ssse3_loop: + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + pmaddubsw mm3, mm6 + pmaddubsw mm2, mm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm5 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict16x16_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE +sym(vp8_bilinear_predict16x16_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(2) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 4 + lea rax, [rax + rcx] ; HFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ; src_pixels_per_line + + movdqa xmm2, [rax] + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ; dst_pitch +%endif + movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 + + punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 + pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm6, xmm5 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm6, xmm1 + + punpcklbw xmm4, xmm5 + pmaddubsw xmm4, xmm1 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + packuswb xmm6, xmm4 + movdqa xmm5, xmm7 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm2 + + punpckhbw xmm7, xmm6 + pmaddubsw xmm7, xmm2 + + paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value + psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm5, xmm7 + movdqa xmm7, xmm6 + + movdqa [rdi], xmm5 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ; dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + + ; get the first horizontal line done + movq xmm4, [rsi] ; load row 0 + movq xmm2, [rsi + 8] ; load row 0 + + lea rsi, [rsi + rax] ; next line +.next_row_sp: + movq xmm3, [rsi] ; load row + 1 + movq xmm5, [rsi + 8] ; load row + 1 + + punpcklbw xmm4, xmm3 + punpcklbw xmm2, xmm5 + + pmaddubsw xmm4, xmm1 + movq xmm7, [rsi + rax] ; load row + 2 + + pmaddubsw xmm2, xmm1 + movq xmm6, [rsi + rax + 8] ; load row + 2 + + punpcklbw xmm3, xmm7 + punpcklbw xmm5, xmm6 + + pmaddubsw xmm3, xmm1 + paddw xmm4, [GLOBAL(rd)] + + pmaddubsw xmm5, xmm1 + paddw xmm2, [GLOBAL(rd)] + + psraw xmm4, VP8_FILTER_SHIFT + psraw xmm2, VP8_FILTER_SHIFT + + packuswb xmm4, xmm2 + paddw xmm3, [GLOBAL(rd)] + + movdqa [rdi], xmm4 ; store row 0 + paddw xmm5, [GLOBAL(rd)] + + psraw xmm3, VP8_FILTER_SHIFT + psraw xmm5, VP8_FILTER_SHIFT + + packuswb xmm3, xmm5 + movdqa xmm4, xmm7 + + movdqa [rdi + rdx],xmm3 ; store row 1 + lea rsi, [rsi + 2*rax] + + movdqa xmm2, xmm6 + lea rdi, [rdi + 2*rdx] + + cmp rdi, rcx + jne .next_row_sp + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + +.next_row_fp: + movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm2, xmm4 + movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 + + pmaddubsw xmm2, xmm1 + movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rax] ; next line + punpcklbw xmm3, xmm4 + + pmaddubsw xmm3, xmm1 + movq xmm5, [rsi] + + paddw xmm2, [GLOBAL(rd)] + movq xmm7, [rsi+1] + + movq xmm6, [rsi+8] + psraw xmm2, VP8_FILTER_SHIFT + + punpcklbw xmm5, xmm7 + movq xmm7, [rsi+9] + + paddw xmm3, [GLOBAL(rd)] + pmaddubsw xmm5, xmm1 + + psraw xmm3, VP8_FILTER_SHIFT + punpcklbw xmm6, xmm7 + + packuswb xmm2, xmm3 + pmaddubsw xmm6, xmm1 + + movdqa [rdi], xmm2 ; store the results in the destination + paddw xmm5, [GLOBAL(rd)] + + lea rdi, [rdi + rdx] ; dst_pitch + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm6, VP8_FILTER_SHIFT + + packuswb xmm5, xmm6 + lea rsi, [rsi + rax] ; next line + + movdqa [rdi], xmm5 ; store the results in the destination + lea rdi, [rdi + rdx] ; dst_pitch + + cmp rdi, rcx + + jne .next_row_fp + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict8x8_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE +sym(vp8_bilinear_predict8x8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ; xoffset + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b8x8_sp_only + + shl rax, 4 + add rax, rcx ; HFilter + + mov rdi, arg(4) ; dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b8x8_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm1, [rax] + + ; get the first horizontal line done + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx + + psrldq xmm5, 1 + lea rsp, [rsp + 16] ; next line + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + lea rsp, [rsp + 16] ; next line + + movdqa xmm5, xmm6 + + psrldq xmm5, 1 + + punpcklbw xmm6, xmm5 + pmaddubsw xmm6, xmm0 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + packuswb xmm6, xmm6 + + punpcklbw xmm7, xmm6 + pmaddubsw xmm7, xmm1 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm7, xmm7 + + movq [rdi], xmm7 ; store the results in the destination + lea rdi, [rdi + rdx] + + movdqa xmm7, xmm6 + + cmp rdi, rcx + jne .next_row + + jmp .done8x8 + +.b8x8_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] ; VFilter + + movq xmm1, XMMWORD PTR [rsp] + movq xmm2, XMMWORD PTR [rsp+16] + + movq xmm3, XMMWORD PTR [rsp+32] + punpcklbw xmm1, xmm2 + + movq xmm4, XMMWORD PTR [rsp+48] + punpcklbw xmm2, xmm3 + + movq xmm5, XMMWORD PTR [rsp+64] + punpcklbw xmm3, xmm4 + + movq xmm6, XMMWORD PTR [rsp+80] + punpcklbw xmm4, xmm5 + + movq xmm7, XMMWORD PTR [rsp+96] + punpcklbw xmm5, xmm6 + + pmaddubsw xmm1, xmm0 + pmaddubsw xmm2, xmm0 + + pmaddubsw xmm3, xmm0 + pmaddubsw xmm4, xmm0 + + pmaddubsw xmm5, xmm0 + punpcklbw xmm6, xmm7 + + pmaddubsw xmm6, xmm0 + paddw xmm1, [GLOBAL(rd)] + + paddw xmm2, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm2, VP8_FILTER_SHIFT + + paddw xmm4, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + psraw xmm6, VP8_FILTER_SHIFT + packuswb xmm1, xmm1 + + packuswb xmm2, xmm2 + movq [rdi], xmm1 + + packuswb xmm3, xmm3 + movq [rdi+rdx], xmm2 + + packuswb xmm4, xmm4 + movq xmm1, XMMWORD PTR [rsp+112] + + lea rdi, [rdi + 2*rdx] + movq xmm2, XMMWORD PTR [rsp+128] + + packuswb xmm5, xmm5 + movq [rdi], xmm3 + + packuswb xmm6, xmm6 + movq [rdi+rdx], xmm4 + + lea rdi, [rdi + 2*rdx] + punpcklbw xmm7, xmm1 + + movq [rdi], xmm5 + pmaddubsw xmm7, xmm0 + + movq [rdi+rdx], xmm6 + punpcklbw xmm1, xmm2 + + pmaddubsw xmm1, xmm0 + paddw xmm7, [GLOBAL(rd)] + + psraw xmm7, VP8_FILTER_SHIFT + paddw xmm1, [GLOBAL(rd)] + + psraw xmm1, VP8_FILTER_SHIFT + packuswb xmm7, xmm7 + + packuswb xmm1, xmm1 + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm7 + + movq [rdi+rdx], xmm1 + lea rsp, [rsp + 144] + + jmp .done8x8 + +.b8x8_fp_only: + lea rcx, [rdi+rdx*8] + +.next_row_fp: + movdqa xmm1, XMMWORD PTR [rsp] + movdqa xmm3, XMMWORD PTR [rsp+16] + + movdqa xmm2, xmm1 + movdqa xmm5, XMMWORD PTR [rsp+32] + + psrldq xmm2, 1 + movdqa xmm7, XMMWORD PTR [rsp+48] + + movdqa xmm4, xmm3 + psrldq xmm4, 1 + + movdqa xmm6, xmm5 + psrldq xmm6, 1 + + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xmm0 + + punpcklbw xmm3, xmm4 + pmaddubsw xmm3, xmm0 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm0 + + movdqa xmm2, xmm7 + psrldq xmm2, 1 + + punpcklbw xmm7, xmm2 + pmaddubsw xmm7, xmm0 + + paddw xmm1, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm7, [GLOBAL(rd)] + psraw xmm7, VP8_FILTER_SHIFT + + packuswb xmm1, xmm1 + packuswb xmm3, xmm3 + + packuswb xmm5, xmm5 + movq [rdi], xmm1 + + packuswb xmm7, xmm7 + movq [rdi+rdx], xmm3 + + lea rdi, [rdi + 2*rdx] + movq [rdi], xmm5 + + lea rsp, [rsp + 4*16] + movq [rdi+rdx], xmm7 + + lea rdi, [rdi + 2*rdx] + cmp rdi, rcx + + jne .next_row_fp + + lea rsp, [rsp + 16] + +.done8x8: + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +shuf2bfrom1: + db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 +align 16 +shuf3bfrom1: + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 + +align 16 +rd: + times 8 dw 0x40 + +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 + diff --git a/media/libvpx/vp8/common/x86/variance_impl_sse2.asm b/media/libvpx/vp8/common/x86/variance_impl_sse2.asm new file mode 100644 index 000000000..26de5e860 --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_impl_sse2.asm @@ -0,0 +1,972 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + +;void vp8_filter_block2d_bil_var_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE +sym(vp8_filter_block2d_bil_var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + pxor xmm6, xmm6 ; + pxor xmm7, xmm7 ; + + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] + + lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + movdqa xmm5, xmm1 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_sse2_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movdqa xmm3, xmm5 ; + movdqa xmm5, xmm1 ; + + pmullw xmm3, [rdx] ; + pmullw xmm1, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rbx] ;ref_pixels_per_line +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 ; + jnz filter_block2d_bil_var_sse2_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(7) ; sum + mov rdi, arg(8) ; sumsquared + + movd [rsi], mm2 ; xsum + movd [rdi], mm4 ; xxsum + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_vert_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE +sym(vp8_half_horiz_vert_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source +%else + add rsi, r8 +%endif + +vp8_half_horiz_vert_variance8x_h_1: + + movq xmm1, QWORD PTR [rsi] ; + movq xmm2, QWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_horiz_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE +sym(vp8_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +vp8_half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_vert_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE +sym(vp8_half_vert_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; +vp8_half_vert_variance8x_h_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_vert_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE +sym(vp8_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr + + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +vp8_half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz vp8_half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE +sym(vp8_half_horiz_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor xmm0, xmm0 ; +vp8_half_horiz_variance8x_h_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz vp8_half_horiz_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE +sym(vp8_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + +vp8_half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm new file mode 100644 index 000000000..686b4a902 --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm @@ -0,0 +1,364 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + + +;void vp8_filter_block2d_bil_var_ssse3 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +;Note: The filter coefficient at offset=0 is 128. Since the second register +;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. +global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE +sym(vp8_filter_block2d_bil_var_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .filter_block2d_bil_var_ssse3_sp_only + + shl rax, 4 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je .filter_block2d_bil_var_ssse3_fp_only + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi+1] + movdqa xmm2, xmm0 + + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, [rax] + pmaddubsw xmm2, [rax] + + paddw xmm0, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm0, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + packuswb xmm0, xmm2 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + r8] +%endif + +.filter_block2d_bil_var_ssse3_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + packuswb xmm1, xmm3 + + movdqa xmm2, xmm0 + movdqa xmm0, xmm1 + movdqa xmm3, xmm2 + + punpcklbw xmm2, xmm1 + punpckhbw xmm3, xmm1 + pmaddubsw xmm2, [rdx] + pmaddubsw xmm3, [rdx] + + paddw xmm2, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm2, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm1, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm1, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm2, xmm1 + psubw xmm3, xmm5 + paddw xmm6, xmm2 + paddw xmm6, xmm3 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm2 + paddd xmm7, xmm3 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rsi, [rsi + r8] + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_var_ssse3_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; Both xoffset =0 and yoffset=0 + je .filter_block2d_bil_var_ssse3_full_pixel + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + movdqu xmm1, XMMWORD PTR [rsi] + movdqa xmm0, xmm1 + +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + lea rsi, [rsi + rax] + +.filter_block2d_bil_sp_only_loop: + movdqu xmm3, XMMWORD PTR [rsi] + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + + punpcklbw xmm1, xmm3 + punpckhbw xmm2, xmm3 + pmaddubsw xmm1, [rdx] + pmaddubsw xmm2, [rdx] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + movq xmm3, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm3, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm3 + psubw xmm2, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + movdqa xmm1, xmm0 + lea rsi, [rsi + rax] ;ref_pixels_per_line + +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_sp_only_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 + +.filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] + punpcklbw xmm1, xmm0 + movq xmm2, QWORD PTR [rsi+8] + punpcklbw xmm2, xmm0 + + movq xmm3, QWORD PTR [rdi] + punpcklbw xmm3, xmm0 + movq xmm4, QWORD PTR [rdi+8] + punpcklbw xmm4, xmm0 + + psubw xmm1, xmm3 + psubw xmm2, xmm4 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rdx] ;src_pixels_per_line + sub rcx, 1 + jnz .filter_block2d_bil_full_pixel_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +.filter_block2d_bil_fp_only_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm2, XMMWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm2, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm2 + psubw xmm3, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm3 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm1 + paddd xmm7, xmm3 + + lea rsi, [rsi + rdx] +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_fp_only_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_variance: + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(7) ;[Sum] + mov rdi, arg(8) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 diff --git a/media/libvpx/vp8/common/x86/variance_ssse3.c b/media/libvpx/vp8/common/x86/variance_ssse3.c new file mode 100644 index 000000000..2a0df640a --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_ssse3.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" + +extern void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_ssse3 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance16x16_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0; + unsigned int xxsum0; + + /* note we could avoid these if statements if the calling function + * just called the appropriate functions inside. + */ + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_variance16x8_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0; + unsigned int xxsum0; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} diff --git a/media/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/vp8/common/x86/vp8_asm_stubs.c new file mode 100644 index 000000000..fb0b57eb1 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "filter_x86.h" + +extern const short vp8_six_tap_mmx[8][6*8]; + +extern void vp8_filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1dc_v6_mmx +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_unpack_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_filter_block1d8_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); + + +#if HAVE_MMX +void vp8_sixtap_predict4x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); + +} + + +void vp8_sixtap_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + + HFilter = vp8_six_tap_mmx[xoffset]; + + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); + +} + + +void vp8_sixtap_predict8x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); + +} + + +void vp8_sixtap_predict8x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); + +} + + + +void vp8_bilinear_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); +} +#endif + + +#if HAVE_SSE2 +void vp8_sixtap_predict16x16_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } +} + + +void vp8_sixtap_predict8x8_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); + } +} + + +void vp8_sixtap_predict8x4_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); + } +} + +#endif + +#if HAVE_SSSE3 + +extern void vp8_filter_block1d8_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d8_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +void vp8_sixtap_predict16x16_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[24*24]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 16, 21, xoffset); + vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, + 16, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 16, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 16, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict8x8_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 13, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 8, yoffset); + } + else + { + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 8, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 8, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + + +void vp8_sixtap_predict8x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 9, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict4x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[4*9]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + FData2, 4, 9, xoffset); + vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + int r; + + for (r = 0; r < 4; r++) + { + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; + } + } + } +} + +#endif diff --git a/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm b/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm new file mode 100644 index 000000000..97f25275d --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define mmx_filter_shift 7 + +;void vp8_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +.filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + + +;void vp8_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +.filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/media/libvpx/vp8/common/x86/vp8_variance_mmx.c b/media/libvpx/vp8/common/x86/vp8_variance_mmx.c new file mode 100644 index 000000000..e594b1e65 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_mmx.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" + +extern void filter_block1d_h6_mmx +( + const unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *filter +); +extern void filter_block1d_v6_mmx +( + const short *src_ptr, + unsigned char *output_ptr, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *filter +); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance4x4_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) + +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); + + +} + +unsigned int vp8_sub_pixel_variance16x8_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +} + + +unsigned int vp8_variance_halfpixvar16x16_h_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, + ref_ptr, recon_stride, sse); +} diff --git a/media/libvpx/vp8/common/x86/vp8_variance_sse2.c b/media/libvpx/vp8/common/x86/vp8_variance_sse2.c new file mode 100644 index 000000000..1c15ed880 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_sse2.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" + +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +void vp8_filter_block2d_bil_var_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance4x4_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum, &xxsum); + } + + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + /* note we could avoid these if statements if the calling function + * just called the appropriate functions inside. + */ + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0 + ); + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum1, &xxsum1 + ); + xsum0 += xsum1; + xxsum0 += xxsum1; + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_variance16x8_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum1, &xxsum1); + xsum0 += xsum1; + xxsum0 += xxsum1; + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum, &xxsum); + } + + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +} + + +unsigned int vp8_variance_halfpixvar16x16_h_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} |