diff options
Diffstat (limited to 'media/libvpx/vpx_dsp/x86/sad_mmx.asm')
-rw-r--r-- | media/libvpx/vpx_dsp/x86/sad_mmx.asm | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/media/libvpx/vpx_dsp/x86/sad_mmx.asm b/media/libvpx/vpx_dsp/x86/sad_mmx.asm new file mode 100644 index 000000000..9968992bd --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_mmx.asm @@ -0,0 +1,427 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vpx_sad16x16_mmx) PRIVATE +global sym(vpx_sad8x16_mmx) PRIVATE +global sym(vpx_sad8x8_mmx) PRIVATE +global sym(vpx_sad4x4_mmx) PRIVATE +global sym(vpx_sad16x8_mmx) PRIVATE + +;unsigned int vpx_sad16x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddw mm0, mm2 + paddw mm1, mm3 + + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + paddw mm7, mm1 + + cmp rsi, rcx + jne .x16x16sad_mmx_loop + + + movq mm0, mm7 + + punpcklwd mm0, mm6 + punpckhwd mm7, mm6 + + paddw mm0, mm7 + movq mm7, mm0 + + + psrlq mm0, 32 + paddw mm7, mm0 + + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad8x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + paddw mm7, mm2 + cmp rsi, rcx + + jne .x8x16sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x8sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + paddw mm0, mm2 + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + cmp rsi, rcx + + jne .x8x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad4x4_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] + + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + pxor mm3, mm3 + + punpcklbw mm0, mm3 + punpckhbw mm2, mm3 + + paddw mm0, mm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm4, DWORD PTR [rsi] + movd mm5, DWORD PTR [rdi] + + movd mm6, DWORD PTR [rsi+rax] + movd mm7, DWORD PTR [rdi+rdx] + + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + + movq mm6, mm4 + psubusb mm4, mm5 + + psubusb mm5, mm6 + por mm4, mm5 + + movq mm5, mm4 + punpcklbw mm4, mm3 + + punpckhbw mm5, mm3 + paddw mm4, mm5 + + paddw mm0, mm4 + movq mm1, mm0 + + punpcklwd mm0, mm3 + punpckhwd mm1, mm3 + + paddw mm0, mm1 + movq mm1, mm0 + + psrlq mm0, 32 + paddw mm0, mm1 + + movq rax, mm0 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad16x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x8sad_mmx_loop: + + movq mm0, [rsi] + movq mm1, [rdi] + + movq mm2, [rsi+8] + movq mm3, [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpckhbw mm1, mm6 + + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + + + paddw mm0, mm2 + paddw mm1, mm3 + + paddw mm0, mm1 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne .x16x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret |