diff options
Diffstat (limited to 'media/libvpx/vp8/common/x86/postproc_sse2.asm')
-rw-r--r-- | media/libvpx/vp8/common/x86/postproc_sse2.asm | 723 |
1 files changed, 723 insertions, 0 deletions
diff --git a/media/libvpx/vp8/common/x86/postproc_sse2.asm b/media/libvpx/vp8/common/x86/postproc_sse2.asm new file mode 100644 index 000000000..fed4ee5cc --- /dev/null +++ b/media/libvpx/vp8/common/x86/postproc_sse2.asm @@ -0,0 +1,723 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;macro in deblock functions +%macro FIRST_2_ROWS 0 + movdqa xmm4, xmm0 + movdqa xmm6, xmm0 + movdqa xmm5, xmm1 + pavgb xmm5, xmm3 + + ;calculate absolute value + psubusb xmm4, xmm1 + psubusb xmm1, xmm0 + psubusb xmm6, xmm3 + psubusb xmm3, xmm0 + paddusb xmm4, xmm1 + paddusb xmm6, xmm3 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm7, xmm2 + + ;get mask + psubusb xmm2, xmm4 + psubusb xmm7, xmm6 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm7, xmm1 + por xmm7, xmm2 +%endmacro + +%macro SECOND_2_ROWS 0 + movdqa xmm6, xmm0 + movdqa xmm4, xmm0 + movdqa xmm2, xmm1 + pavgb xmm1, xmm3 + + ;calculate absolute value + psubusb xmm6, xmm2 + psubusb xmm2, xmm0 + psubusb xmm4, xmm3 + psubusb xmm3, xmm0 + paddusb xmm6, xmm2 + paddusb xmm4, xmm3 + + pavgb xmm5, xmm1 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm3, xmm2 + + ;get mask + psubusb xmm2, xmm6 + psubusb xmm3, xmm4 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm3, xmm1 + + por xmm7, xmm2 + por xmm7, xmm3 + + pavgb xmm5, xmm0 + + ;decide if or not to use filtered value + pand xmm0, xmm7 + pandn xmm7, xmm5 + paddusb xmm0, xmm7 +%endmacro + +%macro UPDATE_FLIMIT 0 + movdqa xmm2, XMMWORD PTR [rbx] + movdqa [rsp], xmm2 + add rbx, 16 +%endmacro + +;void vp8_post_proc_down_and_across_mb_row_sse2 +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int cols, +; int *flimits, +; int size +;) +global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE +sym(vp8_post_proc_down_and_across_mb_row_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 + + ; put flimit on stack + mov rbx, arg(5) ;flimits ptr + UPDATE_FLIMIT + +%define flimit [rsp] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line + movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock +.nextrow: + xor rdx, rdx ;col +.nextcol: + ;load current and next 2 rows + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + 2*rax] + + FIRST_2_ROWS + + ;load above 2 rows + neg rax + movdqu xmm1, XMMWORD PTR [rsi + 2*rax] + movdqu xmm3, XMMWORD PTR [rsi + rax] + + SECOND_2_ROWS + + movdqu XMMWORD PTR [rdi], xmm0 + + neg rax ; positive stride + add rsi, 16 + add rdi, 16 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .downdone + UPDATE_FLIMIT + jmp .nextcol + +.downdone: + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + mov rbx, arg(5) ; flimits + UPDATE_FLIMIT + + ; dup the first byte into the left border 8 times + movq mm1, [rdi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + mov rdx, -8 + movq [rdi+rdx], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(4) + movq mm1, [rdi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rdi+rdx], mm1 + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-16]; + movq mm1, QWORD PTR [rdi-8]; + +.acrossnextcol: + movdqu xmm0, XMMWORD PTR [rdi + rdx] + movdqu xmm1, XMMWORD PTR [rdi + rdx -2] + movdqu xmm3, XMMWORD PTR [rdi + rdx -1] + + FIRST_2_ROWS + + movdqu xmm1, XMMWORD PTR [rdi + rdx +1] + movdqu xmm3, XMMWORD PTR [rdi + rdx +2] + + SECOND_2_ROWS + + movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes + movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes + movdq2q mm0, xmm0 + psrldq xmm0, 8 + movdq2q mm1, xmm0 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .acrossdone + UPDATE_FLIMIT + jmp .acrossnextcol + +.acrossdone + ; last 16 pixels + movq QWORD PTR [rdi+rdx-16], mm0 + + cmp edx, dword arg(4) + jne .throw_last_8 + movq QWORD PTR [rdi+rdx-8], mm1 +.throw_last_8: + ; done with this rwo + add rsi,rax ;next src line + mov eax, dword arg(3) ;dst_pixels_per_line + add rdi,rax ;next destination + mov eax, dword arg(2) ;src_pixels_per_line + + mov rbx, arg(5) ;flimits + UPDATE_FLIMIT + + dec rcx ;decrement count + jnz .nextrow ;next row + + add rsp, 16 + pop rsp + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit + +;void vp8_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_xmm) PRIVATE +sym(vp8_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp8_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c<cols; c+=8) +.loop_col: + mov rsi, arg(0) ; s + pxor xmm0, xmm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + + ; this copies the last row down into the border 8 rows + mov rdi, rsi + mov rdx, arg(2) + sub rdx, 9 + imul rdx, rax + lea rdi, [rdi+rdx] + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_borderd ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_borderd + + neg rax ; rax = -pitch + + ; this copies the first row up into the border 8 rows + mov rdi, rsi + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_border ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_border + + + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movq xmm1, QWORD PTR [rdi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] + movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [rsi+rax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp8_rv))] + movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movdqu xmm4, [sym(vp8_rv) + rcx*2] +%endif + + paddw xmm1, xmm4 + ;paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and rcx, 15 + movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + + cmp edx, 8 + jl .skip_assignment + + mov rcx, rdx + sub rcx, 8 + and rcx, 15 + movq mm0, [rsp + rcx*8] ;d[rcx*8] + movq [rsi], mm0 + +.skip_assignment + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + add dword arg(0), 8 ; s += 8 + sub dword arg(3), 8 ; cols -= 8 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 128+16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE +sym(vp8_mbpost_proc_across_ip_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r<rows;r++) +.ip_row_loop: + + xor rdx, rdx ;sumsq=0; + xor rcx, rcx ;sum=0; + mov rsi, arg(0); s + + + ; dup the first byte into the left border 8 times + movq mm1, [rsi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + + mov rdi, -8 + movq [rsi+rdi], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(3) + movq mm1, [rsi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rsi+rdx], mm1 + +.ip_var_loop: + ;for(i=-8;i<=6;i++) + ;{ + ; sumsq += s[i]*s[i]; + ; sum += s[i]; + ;} + movzx eax, byte [rsi+rdi] + add ecx, eax + mul al + add edx, eax + add rdi, 1 + cmp rdi, 6 + jle .ip_var_loop + + + ;mov rax, sumsq + ;movd xmm7, rax + movd xmm7, edx + + ;mov rax, sum + ;movd xmm6, rax + movd xmm6, ecx + + mov rsi, arg(0) ;s + xor rcx, rcx + + movsxd rdx, dword arg(3) ;cols + add rdx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 +.nextcol4: + + movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 + movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 + + punpcklbw xmm1, xmm0 ; expanding + punpcklbw xmm2, xmm0 ; expanding + + punpcklwd xmm1, xmm0 ; expanding to dwords + punpcklwd xmm2, xmm0 ; expanding to dwords + + psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 ; duplicate the last ones + pshufd xmm7, xmm7, 0 ; duplicate the last ones + + psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 + psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared + pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [rsi+rcx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, [GLOBAL(four8s)] + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [rsi+rcx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add rcx, 4 + + cmp rcx, rdx + jl .nextcol4 + + ;s+=pitch; + movsxd rax, dword arg(1) + add arg(0), rax + + sub dword arg(2), 1 ;rows-=1 + cmp dword arg(2), 0 + jg .ip_row_loop + + add rsp, 16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vp8_plane_add_noise_wmt) PRIVATE +sym(vp8_plane_add_noise_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +four8s: + times 4 dd 8 |