diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /media/libvpx/vp8/encoder/x86 | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/libvpx/vp8/encoder/x86')
-rw-r--r-- | media/libvpx/vp8/encoder/x86/dct_mmx.asm | 241 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/dct_sse2.asm | 432 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/denoising_sse2.c | 379 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/encodeopt.asm | 386 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 164 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/quantize_mmx.asm | 286 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/quantize_sse2.c | 228 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/quantize_sse4.c | 128 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/quantize_ssse3.c | 114 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm | 216 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/subtract_mmx.asm | 223 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/subtract_sse2.asm | 245 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm | 207 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c | 78 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c | 43 |
15 files changed, 3370 insertions, 0 deletions
diff --git a/media/libvpx/vp8/encoder/x86/dct_mmx.asm b/media/libvpx/vp8/encoder/x86/dct_mmx.asm new file mode 100644 index 000000000..6f188cb94 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/dct_mmx.asm @@ -0,0 +1,241 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_mmx) PRIVATE +sym(vp8_short_fdct4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output + + movsxd rax, dword ptr arg(2) ;pitch + + lea rcx, [rsi + rax*2] + ; read the input data + movq mm0, [rsi] + movq mm1, [rsi + rax] + + movq mm2, [rcx] + movq mm4, [rcx + rax] + + ; transpose for the first stage + movq mm3, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 20 21 22 23 + + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm3, mm1 ; 02 12 03 13 + + punpcklwd mm2, mm4 ; 20 30 21 31 + punpckhwd mm5, mm4 ; 22 32 23 33 + + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 + + punpckhdq mm1, mm2 ; 01 11 21 31 + + movq mm2, mm3 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 + + punpckhdq mm3, mm5 ; 03 13 23 33 + + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm3 3 + + ; first stage + movq mm5, mm0 + movq mm4, mm1 + + paddw mm0, mm3 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 + + psubw mm4, mm2 ; c1 = 1 - 2 + psubw mm5, mm3 ; d1 = 0 - 3 + + psllw mm5, 3 + psllw mm4, 3 + + psllw mm0, 3 + psllw mm1, 3 + + ; output 0 and 2 + movq mm2, mm0 ; a1 + + paddw mm0, mm1 ; op[0] = a1 + b1 + psubw mm2, mm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm4 ; c1 d1 + punpckhwd mm5, mm4 ; c1 d1 + + movq mm3, mm1 + movq mm4, mm5 + + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd mm1, MMWORD PTR[GLOBAL(_14500)] + paddd mm4, MMWORD PTR[GLOBAL(_14500)] + paddd mm3, MMWORD PTR[GLOBAL(_7500)] + paddd mm5, MMWORD PTR[GLOBAL(_7500)] + + psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw mm1, mm4 ; op[1] + packssdw mm3, mm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movq mm4, mm0 ; 00 10 20 30 + movq mm5, mm2 ; 02 12 22 32 + + punpcklwd mm0, mm1 ; 00 01 10 11 + punpckhwd mm4, mm1 ; 20 21 30 31 + + punpcklwd mm2, mm3 ; 02 03 12 13 + punpckhwd mm5, mm3 ; 22 23 32 33 + + movq mm1, mm0 ; 00 01 10 11 + punpckldq mm0, mm2 ; 00 01 02 03 + + punpckhdq mm1, mm2 ; 01 22 12 13 + + movq mm2, mm4 ; 20 31 30 31 + punpckldq mm2, mm5 ; 20 21 22 23 + + punpckhdq mm4, mm5 ; 30 31 32 33 + + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm3 4 + + movq mm5, mm0 + movq mm3, mm1 + + paddw mm0, mm4 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 + + psubw mm3, mm2 ; c1 = 1 - 2 + psubw mm5, mm4 ; d1 = 0 - 3 + + pxor mm6, mm6 ; zero out for compare + + pcmpeqw mm6, mm5 ; d1 != 0 + + pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movq mm2, mm0 ; a1 + + paddw mm0, mm1 ; a1 + b1 + psubw mm2, mm1 ; a1 - b1 + + paddw mm0, MMWORD PTR[GLOBAL(_7w)] + paddw mm2, MMWORD PTR[GLOBAL(_7w)] + + psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + movq MMWORD PTR[rdi + 0 ], mm0 + movq MMWORD PTR[rdi + 16], mm2 + + ; output 1 and 3 + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm3 ; c1 d1 + punpckhwd mm5, mm3 ; c1 d1 + + movq mm3, mm1 + movq mm4, mm5 + + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd mm1, MMWORD PTR[GLOBAL(_12000)] + paddd mm4, MMWORD PTR[GLOBAL(_12000)] + paddd mm3, MMWORD PTR[GLOBAL(_51000)] + paddd mm5, MMWORD PTR[GLOBAL(_51000)] + + psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + + packssdw mm1, mm4 ; op[4] + packssdw mm3, mm5 ; op[12] + + paddw mm1, mm6 ; op[4] += (d1!=0) + + movq MMWORD PTR[rdi + 8 ], mm1 + movq MMWORD PTR[rdi + 24], mm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 8 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 8 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 8 +_cmp_mask: + times 4 dw 1 +align 8 +_7w: + times 4 dw 7 +align 8 +_14500: + times 2 dd 14500 +align 8 +_7500: + times 2 dd 7500 +align 8 +_12000: + times 2 dd 12000 +align 8 +_51000: + times 2 dd 51000 diff --git a/media/libvpx/vp8/encoder/x86/dct_sse2.asm b/media/libvpx/vp8/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..d06bca592 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -0,0 +1,432 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE 0 +%if ABI_IS_32BIT + %define input rsi + %define output rdi + %define pitch rax + push rbp + mov rbp, rsp + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + + movsxd rax, dword ptr arg(2) + lea rcx, [rsi + rax*2] +%else + %if LIBVPX_YASM_WIN64 + %define input rcx + %define output rdx + %define pitch r8 + SAVE_XMM 7, u + %else + %define input rdi + %define output rsi + %define pitch rdx + %endif +%endif +%endmacro + +%macro STACK_FRAME_DESTROY 0 + %define input + %define output + %define pitch + +%if ABI_IS_32BIT + pop rdi + pop rsi + RESTORE_GOT + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_sse2) PRIVATE +sym(vp8_short_fdct4x4_sse2): + + STACK_FRAME_CREATE + + movq xmm0, MMWORD PTR[input ] ;03 02 01 00 + movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 + lea input, [input+2*pitch] + movq xmm1, MMWORD PTR[input ] ;23 22 21 20 + movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 + + punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 + punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 + punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00 + pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx + pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx + + punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03 + movdqa xmm3, xmm0 + paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1 + psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 + psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 + psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352 + + paddd xmm3, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_7500)] + psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm0, xmm1 ;op[2] op[0] + packssdw xmm3, xmm4 ;op[3] op[1] + ; 23 22 21 20 03 02 01 00 + ; + ; 33 32 31 30 13 12 11 10 + ; + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30 + + movdqa xmm3, xmm0 + punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00 + punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01 + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] + pshufd xmm2, xmm2, 04eh + movdqa xmm3, xmm0 + paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1 + psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1 + + pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1 + movdqa xmm2, xmm3 ;save d1 for compare + pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1 + pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1 + pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1 + pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1 + pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1 + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + + pxor xmm4, xmm4 ;zero out for compare + paddd xmm0, xmm5 + paddd xmm1, xmm5 + pcmpeqw xmm2, xmm4 + psrad xmm0, 4 ;(a1 + b1 + 7)>>4 + psrad xmm1, 4 ;(a1 - b1 + 7)>>4 + pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper, + ;and keep bit 0 of lower + + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352 + paddd xmm3, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_51000)] + packssdw xmm0, xmm1 ;op[8] op[0] + psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16 + + packssdw xmm3, xmm4 ;op[12] op[4] + movdqa xmm1, xmm0 + paddw xmm3, xmm2 ;op[4] += (d1!=0) + punpcklqdq xmm0, xmm3 ;op[4] op[0] + punpckhqdq xmm1, xmm3 ;op[12] op[8] + + movdqa XMMWORD PTR[output + 0], xmm0 + movdqa XMMWORD PTR[output + 16], xmm1 + + STACK_FRAME_DESTROY + +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct8x4_sse2) PRIVATE +sym(vp8_short_fdct8x4_sse2): + + STACK_FRAME_CREATE + + ; read the input data + movdqa xmm0, [input ] + movdqa xmm2, [input+ pitch] + lea input, [input+2*pitch] + movdqa xmm4, [input ] + movdqa xmm3, [input+ pitch] + + ; transpose for the first stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm2 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + psllw xmm5, 3 + psllw xmm4, 3 + + psllw xmm0, 3 + psllw xmm1, 3 + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; op[0] = a1 + b1 + psubw xmm2, xmm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] + paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] + + psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm1, xmm4 ; op[1] + packssdw xmm3, xmm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 + + movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 + punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 + + movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 + + ; xmm0 0 + ; xmm1 4 + ; xmm2 1 + ; xmm3 3 + + movdqa xmm5, xmm0 + movdqa xmm2, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm4 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + pxor xmm6, xmm6 ; zero out for compare + + pcmpeqw xmm6, xmm5 ; d1 != 0 + + pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; a1 + b1 + psubw xmm2, xmm1 ; a1 - b1 + + paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] + paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] + + psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] + paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] + + psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + + packssdw xmm1, xmm4 ; op[4] + packssdw xmm3, xmm5 ; op[12] + + paddw xmm1, xmm6 ; op[4] += (d1!=0) + + movdqa xmm4, xmm0 + movdqa xmm5, xmm2 + + punpcklqdq xmm0, xmm1 + punpckhqdq xmm4, xmm1 + + punpcklqdq xmm2, xmm3 + punpckhqdq xmm5, xmm3 + + movdqa XMMWORD PTR[output + 0 ], xmm0 + movdqa XMMWORD PTR[output + 16], xmm2 + movdqa XMMWORD PTR[output + 32], xmm4 + movdqa XMMWORD PTR[output + 48], xmm5 + + STACK_FRAME_DESTROY + +SECTION_RODATA +align 16 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 16 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 16 +_mult_add: + times 8 dw 1 +align 16 +_cmp_mask: + times 4 dw 1 + times 4 dw 0 +align 16 +_cmp_mask8x4: + times 8 dw 1 +align 16 +_mult_sub: + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 +align 16 +_7: + times 4 dd 7 +align 16 +_7w: + times 8 dw 7 +align 16 +_14500: + times 4 dd 14500 +align 16 +_7500: + times 4 dd 7500 +align 16 +_12000: + times 4 dd 12000 +align 16 +_51000: + times 4 dd 51000 diff --git a/media/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/vp8/encoder/x86/denoising_sse2.c new file mode 100644 index 000000000..101d646ef --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/encoder/denoising.h" +#include "vp8/common/reconinter.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8_rtcd.h" + +#include <emmintrin.h> +#include "vpx_ports/emmintrin_compat.h" + +/* Compute the sum of all pixel differences of this MB. */ +static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = _mm_srai_epi16( + _mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = _mm_srai_epi16( + _mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); + + return sum_diff; +} + +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, + int mc_avg_y_stride, + unsigned char *running_avg_y, int avg_y_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) +{ + unsigned char *running_avg_y_start = running_avg_y; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + for (r = 0; r < 16; ++r) + { + /* Calculate differences */ + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = _mm_loadu_si128( + (__m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ + const __m128i clamped_absdiff = _mm_min_epu8( + _mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + + { + /* Compute the sum of all pixel differences of this MB. */ + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_avg_y_stride * 16; + running_avg_y -= avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + __m128i v_running_avg_y = + _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, + int mc_avg_stride, + unsigned char *running_avg, int avg_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_start = running_avg; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? + 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + { + const __m128i k_1 = _mm_set1_epi16(1); + __m128i vec_sum_block = _mm_setzero_si128(); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); + const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); + vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + const int sum_block = _mm_cvtsi128_si32(hgfedcba); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + for (r = 0; r < 4; ++r) { + /* Calculate differences */ + const __m128i v_sig_low = _mm_castpd_si128( + _mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), + (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ + const __m128i clamped_absdiff = _mm_min_epu8( + _mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + __m128i v_running_avg; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg = _mm_adds_epu8(v_sig, padj); + v_running_avg = _mm_subs_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + + { + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; + for (r = 0; r < 4; ++r) { + // Calculate differences. + const __m128i v_sig_low = _mm_castpd_si128( + _mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), + (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + const __m128i v_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&running_avg[0]))); + __m128i v_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low), + (double *)(&running_avg[avg_stride]))); + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg = _mm_subs_epu8(v_running_avg, padj); + v_running_avg = _mm_adds_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} diff --git a/media/libvpx/vp8/encoder/x86/encodeopt.asm b/media/libvpx/vp8/encoder/x86/encodeopt.asm new file mode 100644 index 000000000..fe26b18e5 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/encodeopt.asm @@ -0,0 +1,386 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_xmm) PRIVATE +sym(vp8_block_error_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prologue + + mov rsi, arg(0) ;coeff_ptr + mov rdi, arg(1) ;dcoef_ptr + + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + psubw xmm0, xmm1 + psubw xmm2, xmm3 + + pmaddwd xmm0, xmm0 + pmaddwd xmm2, xmm2 + + paddd xmm0, xmm2 + + pxor xmm5, xmm5 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + psrldq xmm0, 8 + paddd xmm0, xmm1 + + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_mmx) PRIVATE +sym(vp8_block_error_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + movq mm3, [rsi] + + movq mm4, [rdi] + movq mm5, [rsi+8] + + movq mm6, [rdi+8] + pxor mm1, mm1 ; from movd mm1, dc ; dc =0 + + movq mm2, mm7 + psubw mm5, mm6 + + por mm1, mm2 + pmaddwd mm5, mm5 + + pcmpeqw mm1, mm7 + psubw mm3, mm4 + + pand mm1, mm3 + pmaddwd mm1, mm1 + + paddd mm1, mm5 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm3, mm5 + + paddd mm1, mm3 + movq mm0, mm1 + + psrlq mm1, 32 + paddd mm0, mm1 + + movq rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_mmx_impl) PRIVATE +sym(vp8_mbblock_error_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + pxor mm2, mm2 + + movd mm1, dword ptr arg(2) ;dc + por mm1, mm2 + + pcmpeqw mm1, mm7 + mov rcx, 16 + +.mberror_loop_mmx: + movq mm3, [rsi] + movq mm4, [rdi] + + movq mm5, [rsi+8] + movq mm6, [rdi+8] + + + psubw mm5, mm6 + pmaddwd mm5, mm5 + + psubw mm3, mm4 + pand mm3, mm1 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + add rsi, 32 + + add rdi, 32 + sub rcx, 1 + + jnz .mberror_loop_mmx + + movq mm0, mm2 + psrlq mm2, 32 + + paddd mm0, mm2 + movq rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_xmm_impl) PRIVATE +sym(vp8_mbblock_error_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 6 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm6, xmm6 + + mov rdi, arg(1) ;dcoef_ptr + pxor xmm4, xmm4 + + movd xmm5, dword ptr arg(2) ;dc + por xmm5, xmm4 + + pcmpeqw xmm5, xmm6 + mov rcx, 16 + +.mberror_loop: + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + + psubw xmm2, xmm3 + pmaddwd xmm2, xmm2 + + psubw xmm0, xmm1 + pand xmm0, xmm5 + + pmaddwd xmm0, xmm0 + add rsi, 32 + + add rdi, 32 + + sub rcx, 1 + paddd xmm4, xmm2 + + paddd xmm4, xmm0 + jnz .mberror_loop + + movdqa xmm0, xmm4 + punpckldq xmm0, xmm6 + + punpckhdq xmm4, xmm6 + paddd xmm0, xmm4 + + movdqa xmm1, xmm0 + psrldq xmm0, 8 + + paddd xmm0, xmm1 + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_mmx_impl) PRIVATE +sym(vp8_mbuverror_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor mm7, mm7 + +.mbuverror_loop_mmx: + + movq mm1, [rsi] + movq mm2, [rdi] + + psubw mm1, mm2 + pmaddwd mm1, mm1 + + + movq mm3, [rsi+8] + movq mm4, [rdi+8] + + psubw mm3, mm4 + pmaddwd mm3, mm3 + + + paddd mm7, mm1 + paddd mm7, mm3 + + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz .mbuverror_loop_mmx + + movq mm0, mm7 + psrlq mm7, 32 + + paddd mm0, mm7 + movq rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_xmm_impl) PRIVATE +sym(vp8_mbuverror_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor xmm3, xmm3 + +.mbuverror_loop: + + movdqa xmm1, [rsi] + movdqa xmm2, [rdi] + + psubw xmm1, xmm2 + pmaddwd xmm1, xmm1 + + paddd xmm3, xmm1 + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz .mbuverror_loop + + pxor xmm0, xmm0 + movdqa xmm1, xmm3 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + paddd xmm1, xmm2 + + movdqa xmm2, xmm1 + + psrldq xmm1, 8 + paddd xmm1, xmm2 + + movq rax, xmm1 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm new file mode 100644 index 000000000..f4989279f --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -0,0 +1,164 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_walsh4x4_sse2) PRIVATE +sym(vp8_short_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output + movsxd rdx, dword ptr arg(2) ; pitch + + ; first for loop + movq xmm0, MMWORD PTR [rsi] ; load input + movq xmm1, MMWORD PTR [rsi + rdx] + lea rsi, [rsi + rdx*2] + movq xmm2, MMWORD PTR [rsi] + movq xmm3, MMWORD PTR [rsi + rdx] + + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ; ip[1] ip[0] + punpckhdq xmm1, xmm2 ; ip[3] ip[2] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + psllw xmm0, 2 ; d1 a1 + psllw xmm2, 2 ; c1 b1 + + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 ; b1 a1 + punpckhqdq xmm1, xmm2 ; c1 d1 + + pxor xmm6, xmm6 + movq xmm6, xmm0 + pxor xmm7, xmm7 + pcmpeqw xmm7, xmm6 + paddw xmm7, [GLOBAL(c1)] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 ; b1+c1 a1+d1 + psubw xmm2, xmm1 ; b1-c1 a1-d1 + paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) + + ; second for loop + ; input: 13 9 5 1 12 8 4 0 (xmm0) + ; 14 10 6 2 15 11 7 3 (xmm2) + ; after shuffle: + ; 13 5 9 1 12 4 8 0 (xmm0) + ; 14 6 10 2 15 7 11 3 (xmm1) + pshuflw xmm3, xmm0, 0xd8 + pshufhw xmm0, xmm3, 0xd8 + pshuflw xmm3, xmm2, 0xd8 + pshufhw xmm1, xmm3, 0xd8 + + movdqa xmm2, xmm0 + pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 + pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 + movdqa xmm3, xmm1 + pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 + pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 + + pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 + pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 + pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 + pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 + + movdqa xmm0, xmm4 + punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 + punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 + movdqa xmm1, xmm6 + punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 + punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 + + movdqa xmm2, xmm0 + paddd xmm0, xmm4 ; b21 b20 a21 a20 + psubd xmm2, xmm4 ; c21 c20 d21 d20 + movdqa xmm3, xmm1 + paddd xmm1, xmm6 ; b23 b22 a23 a22 + psubd xmm3, xmm6 ; c23 c22 d23 d22 + + pxor xmm4, xmm4 + movdqa xmm5, xmm4 + pcmpgtd xmm4, xmm0 + pcmpgtd xmm5, xmm2 + pand xmm4, [GLOBAL(cd1)] + pand xmm5, [GLOBAL(cd1)] + + pxor xmm6, xmm6 + movdqa xmm7, xmm6 + pcmpgtd xmm6, xmm1 + pcmpgtd xmm7, xmm3 + pand xmm6, [GLOBAL(cd1)] + pand xmm7, [GLOBAL(cd1)] + + paddd xmm0, xmm4 + paddd xmm2, xmm5 + paddd xmm0, [GLOBAL(cd3)] + paddd xmm2, [GLOBAL(cd3)] + paddd xmm1, xmm6 + paddd xmm3, xmm7 + paddd xmm1, [GLOBAL(cd3)] + paddd xmm3, [GLOBAL(cd3)] + + psrad xmm0, 3 + psrad xmm1, 3 + psrad xmm2, 3 + psrad xmm3, 3 + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 + punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 + movdqa xmm5, xmm2 + punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 + punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 + + packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 + packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm2 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +c1: + dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 +align 16 +cn1: + dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff +align 16 +cd1: + dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 +align 16 +cd3: + dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/media/libvpx/vp8/encoder/x86/quantize_mmx.asm b/media/libvpx/vp8/encoder/x86/quantize_mmx.asm new file mode 100644 index 000000000..2864ce16d --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/quantize_mmx.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE +sym(vp8_fast_quantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + movq mm0, [rsi] + + mov rax, arg(1) ;zbin_ptr + movq mm1, [rax] + + movq mm3, mm0 + psraw mm0, 15 + + pxor mm3, mm0 + psubw mm3, mm0 ; abs + + movq mm2, mm3 + pcmpgtw mm1, mm2 + + pandn mm1, mm2 + movq mm3, mm1 + + mov rdx, arg(6) ;quant_ptr + movq mm1, [rdx] + + mov rcx, arg(5) ;round_ptr + movq mm2, [rcx] + + paddw mm3, mm2 + pmulhuw mm3, mm1 + + pxor mm3, mm0 + psubw mm3, mm0 ;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + movq mm0, mm3 + + movq [rdi], mm3 + + mov rax, arg(3) ;dequant_ptr + movq mm2, [rax] + + pmullw mm3, mm2 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax], mm3 + + ; next 8 + movq mm4, [rsi+8] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+8] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+8] + movq mm6, [rcx+8] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+8], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+8] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+8], mm7 + + + ; next 8 + movq mm4, [rsi+16] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+16] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+16] + movq mm6, [rcx+16] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+16], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+16] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+16], mm7 + + + ; next 8 + movq mm4, [rsi+24] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+24] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+24] + movq mm6, [rcx+24] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+24], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+24] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+24], mm7 + + + + mov rdi, arg(4) ;scan_mask + mov rsi, arg(2) ;qcoeff_ptr + + pxor mm5, mm5 + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rsi+8] + + movq mm2, [rdi] + movq mm3, [rdi+8]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + movq mm5, mm0 + + paddd mm5, mm1 + + movq mm0, [rsi+16] + movq mm1, [rsi+24] + + movq mm2, [rdi+16] + movq mm3, [rdi+24]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + paddd mm5, mm0 + + paddd mm5, mm1 + movq mm0, mm5 + + psrlq mm5, 32 + paddd mm0, mm5 + + ; eob adjustment begins here + movq rcx, mm0 + and rcx, 0xffff + + xor rdx, rdx + sub rdx, rcx ; rdx=-rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + ; Substitute the sse assembly for the old mmx mixed assembly/C. The + ; following is kept as reference + ; movq rcx, mm0 + ; bsr rax, rcx + ; + ; mov eob, rax + ; mov eee, rcx + ; + ;if(eee==0) + ;{ + ; eob=-1; + ;} + ;else if(eee<0) + ;{ + ; eob=15; + ;} + ;d->eob = eob+1; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/encoder/x86/quantize_sse2.c b/media/libvpx/vp8/encoder/x86/quantize_sse2.c new file mode 100644 index 000000000..b4e92e04b --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/quantize_sse2.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/encoder/block.h" +#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ + +#include <mmintrin.h> /* MMX */ +#include <xmmintrin.h> /* SSE */ +#include <emmintrin.h> /* SSE2 */ + +#define SELECT_EOB(i, z) \ + do { \ + short boost = *zbin_boost_ptr; \ + int cmp = (x[z] < boost) | (y[z] == 0); \ + zbin_boost_ptr++; \ + if (cmp) \ + break; \ + qcoeff_ptr[z] = y[z]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } while (0) + +void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) +{ + char eob = 0; + short *zbin_boost_ptr; + short *qcoeff_ptr = d->qcoeff; + DECLARE_ALIGNED(16, short, x[16]); + DECLARE_ALIGNED(16, short, y[16]); + + __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + memset(qcoeff_ptr, 0, 32); + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* Sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + _mm_store_si128((__m128i *)(x), x_minus_zbin0); + _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Return the sign: (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + y0 = _mm_sub_epi16(y0, sz0); + y1 = _mm_sub_epi16(y1, sz1); + + _mm_store_si128((__m128i *)(y), y0); + _mm_store_si128((__m128i *)(y + 8), y1); + + zbin_boost_ptr = b->zrun_zbin_boost; + + /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ + SELECT_EOB(1, 0); + SELECT_EOB(2, 1); + SELECT_EOB(3, 4); + SELECT_EOB(4, 8); + SELECT_EOB(5, 5); + SELECT_EOB(6, 2); + SELECT_EOB(7, 3); + SELECT_EOB(8, 6); + SELECT_EOB(9, 9); + SELECT_EOB(10, 12); + SELECT_EOB(11, 13); + SELECT_EOB(12, 10); + SELECT_EOB(13, 7); + SELECT_EOB(14, 11); + SELECT_EOB(15, 14); + SELECT_EOB(16, 15); + + y0 = _mm_load_si128((__m128i *)(d->qcoeff)); + y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); + + /* dqcoeff = qcoeff * dequant */ + y0 = _mm_mullo_epi16(y0, dequant0); + y1 = _mm_mullo_epi16(y1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), y0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); + + *d->eob = eob; +} + +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) +{ + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); + __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); + + __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* x = abs(y) = (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + xdq0 = _mm_mullo_epi16(x0, dequant0); + xdq1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); + + /* build a mask for the zig zag */ + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpeq_epi16(x0, zeros); + x1 = _mm_cmpeq_epi16(x1, zeros); + + ones = _mm_cmpeq_epi16(zeros, zeros); + + x0 = _mm_xor_si128(x0, ones); + x1 = _mm_xor_si128(x1, ones); + + x0 = _mm_and_si128(x0, inv_zig_zag0); + x1 = _mm_and_si128(x1, inv_zig_zag1); + + x0 = _mm_max_epi16(x0, x1); + + /* now down to 8 */ + x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* only 4 left */ + x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* okay, just 2! */ + x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 + + x0 = _mm_max_epi16(x0, x1); + + *d->eob = 0xFF & _mm_cvtsi128_si32(x0); +} diff --git a/media/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/vp8/encoder/x86/quantize_sse4.c new file mode 100644 index 000000000..601dd23a2 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <smmintrin.h> /* SSE4.1 */ + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" +#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ + +#define SELECT_EOB(i, z, x, y, q) \ + do { \ + short boost = *zbin_boost_ptr; \ + short x_z = _mm_extract_epi16(x, z); \ + short y_z = _mm_extract_epi16(y, z); \ + int cmp = (x_z < boost) | (y_z == 0); \ + zbin_boost_ptr++; \ + if (cmp) \ + break; \ + q = _mm_insert_epi16(q, y_z, z); \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } while (0) + +void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { + char eob = 0; + short *zbin_boost_ptr = b->zrun_zbin_boost; + + __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, + dqcoeff0, dqcoeff1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i qcoeff0 = _mm_setzero_si128(); + __m128i qcoeff1 = _mm_setzero_si128(); + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* Sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Return the sign: (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + y0 = _mm_sub_epi16(y0, sz0); + y1 = _mm_sub_epi16(y1, sz1); + + /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ + SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1); + + _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); + + dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0); + dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); + + *d->eob = eob; +} diff --git a/media/libvpx/vp8/encoder/x86/quantize_ssse3.c b/media/libvpx/vp8/encoder/x86/quantize_ssse3.c new file mode 100644 index 000000000..448217ff4 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/quantize_ssse3.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <tmmintrin.h> /* SSSE3 */ + +#include "vp8/encoder/block.h" + +/* bitscan reverse (bsr) */ +#if defined(_MSC_VER) +#include <intrin.h> +#pragma intrinsic(_BitScanReverse) +static int bsr(int mask) { + int eob; + _BitScanReverse(&eob, mask); + eob++; + if (mask == 0) + eob = 0; + return eob; +} +#else +static int bsr(int mask) { + int eob; +#if defined(__GNUC__) && __GNUC__ + __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags"); +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags"); +#endif + eob++; + if (mask == 0) + eob = 0; + return eob; +} +#endif + +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { + int eob, mask; + + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; + + DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = + { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; + __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* ASM saves Y for EOB */ + /* I think we can ignore that because adding the sign doesn't change anything + * and multiplying 0 by dequant is OK as well */ + abs0 = y0; + abs1 = y1; + + /* Restore the sign bit. */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + x0 = _mm_mullo_epi16(x0, dequant0); + x1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), x0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1); + + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpgt_epi16(abs0, zeros); + x1 = _mm_cmpgt_epi16(abs1, zeros); + + x = _mm_packs_epi16(x0, x1); + + x = _mm_shuffle_epi8(x, zig_zag); + + mask = _mm_movemask_epi8(x); + + eob = bsr(mask); + + *d->eob = 0xFF & eob; +} diff --git a/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm b/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm new file mode 100644 index 000000000..5964a85f2 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm @@ -0,0 +1,216 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp8_ssim_parms_16x16_sse2) PRIVATE +sym(vp8_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp8_ssim_parms_8x8_sse2) PRIVATE +sym(vp8_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/encoder/x86/subtract_mmx.asm b/media/libvpx/vp8/encoder/x86/subtract_mmx.asm new file mode 100644 index 000000000..794dd22f7 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/subtract_mmx.asm @@ -0,0 +1,223 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, +; short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_mmx_impl) PRIVATE +sym(vp8_subtract_b_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi], mm0 + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2],mm0 + + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) +global sym(vp8_subtract_mby_mmx) PRIVATE +sym(vp8_subtract_mby_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + push rbx + movsxd rbx, dword ptr arg(4);pred_stride + + pxor mm0, mm0 + mov rcx, 16 + + +.submby_loop: + movq mm1, [rsi] + movq mm3, [rax] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi], mm1 + movq [rdi+8], mm2 + + movq mm1, [rsi+8] + movq mm3, [rax+8] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi+16], mm1 + movq [rdi+24], mm2 + add rdi, 32 + lea rax, [rax+rbx] + lea rsi, [rsi+rdx] + dec rcx + jnz .submby_loop + + pop rbx + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) + +global sym(vp8_subtract_mbuv_mmx) PRIVATE +sym(vp8_subtract_mbuv_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 8 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + pxor mm7, mm7 + +.submbu_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 8 + +.submbv_loop: + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + add rdi, 16 + add rsi, rdx + add rax, rbx + + dec rcx + jnz .submbv_loop + + pop rbx + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/encoder/x86/subtract_sse2.asm b/media/libvpx/vp8/encoder/x86/subtract_sse2.asm new file mode 100644 index 000000000..a5d17f5be --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/subtract_sse2.asm @@ -0,0 +1,245 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, +; short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_sse2_impl) PRIVATE +sym(vp8_subtract_b_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi], mm0 + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*2], mm0 + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, +;unsigned char *pred, int pred_stride) +global sym(vp8_subtract_mby_sse2) PRIVATE +sym(vp8_subtract_mby_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;src + movsxd rdx, dword ptr arg(2);src_stride + mov rax, arg(3) ;pred + movdqa xmm4, [GLOBAL(t80)] + push rbx + mov rcx, 8 ; do two lines at one time + movsxd rbx, dword ptr arg(4);pred_stride + +.submby_loop: + movdqa xmm0, [rsi] ; src + movdqa xmm1, [rax] ; pred + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm1 ; put sign back to subtraction + + movdqa xmm3, [rsi + rdx] + movdqa xmm5, [rax + rbx] + + lea rsi, [rsi+rdx*2] + lea rax, [rax+rbx*2] + + movdqa [rdi], xmm0 + movdqa [rdi +16], xmm2 + + movdqa xmm1, xmm3 + psubb xmm3, xmm5 + + pxor xmm5, xmm4 ;convert to signed values + pxor xmm1, xmm4 + pcmpgtb xmm5, xmm1 ; obtain sign information + + movdqa xmm1, xmm3 + punpcklbw xmm3, xmm5 ; put sign back to subtraction + punpckhbw xmm1, xmm5 ; put sign back to subtraction + + movdqa [rdi +32], xmm3 + movdqa [rdi +48], xmm1 + + add rdi, 64 + dec rcx + jnz .submby_loop + + pop rbx + pop rdi + pop rsi + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, +; int src_stride, unsigned char *upred, +; unsigned char *vpred, int pred_stride) +global sym(vp8_subtract_mbuv_sse2) PRIVATE +sym(vp8_subtract_mbuv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movdqa xmm4, [GLOBAL(t80)] + mov rdi, arg(0) ;diff + mov rsi, arg(1) ;usrc + movsxd rdx, dword ptr arg(3);src_stride; + mov rax, arg(4) ;upred + add rdi, 256*2 ;diff = diff + 256 (shorts) + mov rcx, 4 + push rbx + movsxd rbx, dword ptr arg(6);pred_stride + + ;u +.submbu_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbu_loop + + mov rsi, arg(2) ;vsrc + mov rax, arg(5) ;vpred + mov rcx, 4 + + ;v +.submbv_loop: + movq xmm0, [rsi] ; src + movq xmm2, [rsi+rdx] ; src -- next line + movq xmm1, [rax] ; pred + movq xmm3, [rax+rbx] ; pred -- next line + lea rsi, [rsi + rdx*2] + lea rax, [rax + rbx*2] + + punpcklqdq xmm0, xmm2 + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, xmm4 ;convert to signed values + pxor xmm2, xmm4 + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa [rdi], xmm0 ; store difference + movdqa [rdi +16], xmm2 ; store difference + add rdi, 32 + sub rcx, 1 + jnz .submbv_loop + + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t80: + times 16 db 0x80 diff --git a/media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm new file mode 100644 index 000000000..bd92b398a --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -0,0 +1,207 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +; void vp8_temporal_filter_apply_sse2 | arg +; (unsigned char *frame1, | 0 +; unsigned int stride, | 1 +; unsigned char *frame2, | 2 +; unsigned int block_size, | 3 +; int strength, | 4 +; int filter_weight, | 5 +; unsigned int *accumulator, | 6 +; unsigned short *count) | 7 +global sym(vp8_temporal_filter_apply_sse2) PRIVATE +sym(vp8_temporal_filter_apply_sse2): + + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ALIGN_STACK 16, rax + %define block_size 0 + %define strength 16 + %define filter_weight 32 + %define rounding_bit 48 + %define rbp_backup 64 + %define stack_size 80 + sub rsp, stack_size + mov [rsp + rbp_backup], rbp + ; end prolog + + mov rdx, arg(3) + mov [rsp + block_size], rdx + movd xmm6, arg(4) + movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read + + ; calculate the rounding bit outside the loop + ; 0x8000 >> (16 - strength) + mov rdx, 16 + sub rdx, arg(4) ; 16 - strength + movq xmm4, rdx ; can't use rdx w/ shift + movdqa xmm5, [GLOBAL(_const_top_bit)] + psrlw xmm5, xmm4 + movdqa [rsp + rounding_bit], xmm5 + + mov rsi, arg(0) ; src/frame1 + mov rdx, arg(2) ; predictor frame + mov rdi, arg(6) ; accumulator + mov rax, arg(7) ; count + + ; dup the filter weight and store for later + movd xmm0, arg(5) ; filter_weight + pshuflw xmm0, xmm0, 0 + punpcklwd xmm0, xmm0 + movdqa [rsp + filter_weight], xmm0 + + mov rbp, arg(1) ; stride + pxor xmm7, xmm7 ; zero for extraction + + lea rcx, [rdx + 16*16*1] + cmp dword ptr [rsp + block_size], 8 + jne .temporal_filter_apply_load_16 + lea rcx, [rdx + 8*8*1] + +.temporal_filter_apply_load_8: + movq xmm0, [rsi] ; first row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm0, xmm7 ; src[ 0- 7] + movq xmm1, [rsi] ; second row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm1, xmm7 ; src[ 8-15] + jmp .temporal_filter_apply_load_finished + +.temporal_filter_apply_load_16: + movdqa xmm0, [rsi] ; src (frame1) + lea rsi, [rsi + rbp] ; += stride + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; src[ 0- 7] + punpckhbw xmm1, xmm7 ; src[ 8-15] + +.temporal_filter_apply_load_finished: + movdqa xmm2, [rdx] ; predictor (frame2) + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm7 ; pred[ 0- 7] + punpckhbw xmm3, xmm7 ; pred[ 8-15] + + ; modifier = src_byte - pixel_value + psubw xmm0, xmm2 ; src - pred[ 0- 7] + psubw xmm1, xmm3 ; src - pred[ 8-15] + + ; modifier *= modifier + pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 + pmullw xmm1, xmm1 ; modifer[ 8-15]^2 + + ; modifier *= 3 + pmullw xmm0, [GLOBAL(_const_3w)] + pmullw xmm1, [GLOBAL(_const_3w)] + + ; modifer += 0x8000 >> (16 - strength) + paddw xmm0, [rsp + rounding_bit] + paddw xmm1, [rsp + rounding_bit] + + ; modifier >>= strength + psrlw xmm0, [rsp + strength] + psrlw xmm1, [rsp + strength] + + ; modifier = 16 - modifier + ; saturation takes care of modifier > 16 + movdqa xmm3, [GLOBAL(_const_16w)] + movdqa xmm2, [GLOBAL(_const_16w)] + psubusw xmm3, xmm1 + psubusw xmm2, xmm0 + + ; modifier *= filter_weight + pmullw xmm2, [rsp + filter_weight] + pmullw xmm3, [rsp + filter_weight] + + ; count + movdqa xmm4, [rax] + movdqa xmm5, [rax+16] + ; += modifier + paddw xmm4, xmm2 + paddw xmm5, xmm3 + ; write back + movdqa [rax], xmm4 + movdqa [rax+16], xmm5 + lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) + + ; load and extract the predictor up to shorts + pxor xmm7, xmm7 + movdqa xmm0, [rdx] + lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; pred[ 0- 7] + punpckhbw xmm1, xmm7 ; pred[ 8-15] + + ; modifier *= pixel_value + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + ; expand to double words + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm7 ; [ 0- 3] + punpckhwd xmm2, xmm7 ; [ 4- 7] + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm7 ; [ 8-11] + punpckhwd xmm3, xmm7 ; [12-15] + + ; accumulator + movdqa xmm4, [rdi] + movdqa xmm5, [rdi+16] + movdqa xmm6, [rdi+32] + movdqa xmm7, [rdi+48] + ; += modifier + paddd xmm4, xmm0 + paddd xmm5, xmm2 + paddd xmm6, xmm1 + paddd xmm7, xmm3 + ; write back + movdqa [rdi], xmm4 + movdqa [rdi+16], xmm5 + movdqa [rdi+32], xmm6 + movdqa [rdi+48], xmm7 + lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) + + cmp rdx, rcx + je .temporal_filter_apply_epilog + pxor xmm7, xmm7 ; zero for extraction + cmp dword ptr [rsp + block_size], 16 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 + +.temporal_filter_apply_epilog: + ; begin epilog + mov rbp, [rsp + rbp_backup] + add rsp, stack_size + pop rsp + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +_const_3w: + times 8 dw 3 +align 16 +_const_top_bit: + times 8 dw 1<<15 +align 16 +_const_16w + times 8 dw 16 diff --git a/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c new file mode 100644 index 000000000..cf3d8ca4a --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vp8/encoder/block.h" + +void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); +void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_mmx(input, output, pitch); + vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); +} + +int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr, short *dequant_ptr, + const short *scan_mask, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) +{ + const short *scan_mask = vp8_default_zig_zag_mask; + short *coeff_ptr = b->coeff; + short *zbin_ptr = b->zbin; + short *round_ptr = b->round; + short *quant_ptr = b->quant_fast; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = d->dequant; + + *d->eob = (char)vp8_fast_quantize_b_impl_mmx( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + scan_mask, + + round_ptr, + quant_ptr, + dqcoeff_ptr + ); +} + +int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_mmx(MACROBLOCK *mb) +{ + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); +} + +void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + unsigned int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); +} diff --git a/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c new file mode 100644 index 000000000..3dfbee368 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vp8/encoder/block.h" + +int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_xmm(MACROBLOCK *mb) +{ + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); +} + +void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + unsigned int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); +} |