;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Register roles shared by every macro and function in this file:
;   rsi  = current source row      (read with movdqa, 16 bytes per row)
;   rax  = source stride
;   rdi  = current reference row
;   rdx  = reference stride
;   xmm5 = running psadbw sums for the reference at byte offset +0
;   xmm6 = running psadbw sums for the reference at byte offset +1
;   xmm7 = running psadbw sums for the reference at byte offset +2
;   xmm0-xmm4 = scratch
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; PROCESS_16X2X3 is_first
;
; Accumulates the SADs of two 16-byte rows against three reference
; positions, reading the reference with unaligned lddqu loads at
; [rdi], [rdi+1] and [rdi+2].
;
;   %1 (is_first) : non-zero -> the first row initialises xmm5/xmm6/xmm7
;                   zero     -> the first row is added to xmm5/xmm6/xmm7
;
; The second row is always added. rsi and rdi are advanced by two rows.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
        ; ---- row 0 ----
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        ; results land directly in the accumulators
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif

        ; ---- row 1 ----
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; step both pointers past the two rows just loaded
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET is_first, offset
;
; Same job as PROCESS_16X2X3, but the reference is read with two aligned
; movdqa loads ([rdi] and [rdi+16]) and the three 16-byte windows starting
; at byte offset, offset+1 and offset+2 are extracted with palignr.
; The caller must already have moved rdi back by `offset` bytes.
;
;   %1 (is_first) : non-zero -> the first row initialises xmm5/xmm6/xmm7
;                   zero     -> the first row is added to xmm5/xmm6/xmm7
;   %2 (offset)   : palignr byte shift of the +0 window
;
; rsi and rdi are advanced by two rows.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
        ; ---- row 0 ----
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
%if %1
        ; results land directly in the accumulators
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif

        ; ---- row 1 ----
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; step both pointers past the two rows just loaded
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, label_prefix
;
; Emits the jump-table target <label_prefix>_aligned_by_<offset> for a
; 16-row block: moves rdi back by `offset` bytes, runs eight two-row steps
; (the first one initialising the accumulators) and jumps to
; <label_prefix>_store_off.
;-----------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, label_prefix
;
; Eight-row counterpart of PROCESS_16X16X3_OFFSET: four two-row steps.
;-----------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------------
; SAD16X3_ALIGN_DISPATCH label_prefix
;
; Jumps to <label_prefix>_aligned_by_N, where N is the low four bits of the
; reference pointer in rdi.
;
; The table is emitted inline in the code stream (and jumped over); each
; entry is a 32-bit distance from <label_prefix>_do_jump to the target.
; call/pop yields the run-time address of <label_prefix>_do_jump, so the
; lookup needs no absolute addresses.
;
;   In : rdi = ref_ptr
;   Out: rax = src_stride (arg 1), rdx = ref_stride (arg 3),
;        both loaded with movsxd just before the jump
;   Clobbers: rcx
;-----------------------------------------------------------------------------
%macro SAD16X3_ALIGN_DISPATCH 1
        mov             rdx,        0xf
        and             rdx,        rdi             ; rdx = ref_ptr & 15

        jmp             %1_skiptable
%1_jumptable:
        dd %1_aligned_by_0  - %1_do_jump
        dd %1_aligned_by_1  - %1_do_jump
        dd %1_aligned_by_2  - %1_do_jump
        dd %1_aligned_by_3  - %1_do_jump
        dd %1_aligned_by_4  - %1_do_jump
        dd %1_aligned_by_5  - %1_do_jump
        dd %1_aligned_by_6  - %1_do_jump
        dd %1_aligned_by_7  - %1_do_jump
        dd %1_aligned_by_8  - %1_do_jump
        dd %1_aligned_by_9  - %1_do_jump
        dd %1_aligned_by_10 - %1_do_jump
        dd %1_aligned_by_11 - %1_do_jump
        dd %1_aligned_by_12 - %1_do_jump
        dd %1_aligned_by_13 - %1_do_jump
        dd %1_aligned_by_14 - %1_do_jump
        dd %1_aligned_by_15 - %1_do_jump
%1_skiptable:

        call            %1_do_jump
%1_do_jump:
        pop             rcx                         ; rcx = address of do_jump
        mov             rax,        %1_jumptable - %1_do_jump
        add             rax,        rcx             ; rax = address of the jump table

        movsxd          rax,        dword [rax + 4*rdx] ; 32-bit offset for this alignment
        add             rcx,        rax             ; rcx = address of aligned_by_N

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx
%endmacro

;-----------------------------------------------------------------------------
; SAD16X3_STORE_RESULTS
;
; Folds the two 64-bit halves of xmm5, xmm6 and xmm7 together and writes the
; three totals as consecutive 32-bit values at results[0], results[1] and
; results[2] (results = arg 4).
;
;   Clobbers: rdi, xmm0, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------------
%macro SAD16X3_STORE_RESULTS 0
        mov             rdi,        arg(4) ;Results

        ; reference offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0

        ; reference offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0

        ; reference offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0
%endmacro

;-----------------------------------------------------------------------------
;void aom_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute differences over 16 rows of 16 bytes between the source
; block and the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and
; stores the three sums in results[0..2].
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned. The reference may have any alignment: the low
; four bits of ref_ptr pick one of sixteen code paths.
;-----------------------------------------------------------------------------
global sym(aom_sad16x16x3_ssse3) PRIVATE
sym(aom_sad16x16x3_ssse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        push            rsi
        push            rdi
        push            rcx
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        SAD16X3_ALIGN_DISPATCH .aom_sad16x16x3_ssse3

        ; Offsets 0..14: aligned reference loads + palignr.
        PROCESS_16X16X3_OFFSET 0,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3

        ; Offset 15: the +2 window would start at byte 17 and run past the
        ; 32 bytes covered by the two aligned loads, so this path uses the
        ; unaligned-load variant and falls through to store_off.
.aom_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
%rep 7
        PROCESS_16X2X3 0
%endrep

.aom_sad16x16x3_ssse3_store_off:
        SAD16X3_STORE_RESULTS

        ; begin epilog
        pop             rcx
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret

;-----------------------------------------------------------------------------
;void aom_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Eight-row counterpart of aom_sad16x16x3_ssse3: sums absolute differences
; over 8 rows of 16 bytes between the source block and the reference block
; at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the three sums in
; results[0..2].
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned. The reference may have any alignment: the low
; four bits of ref_ptr pick one of sixteen code paths.
;-----------------------------------------------------------------------------
global sym(aom_sad16x8x3_ssse3) PRIVATE
sym(aom_sad16x8x3_ssse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        push            rsi
        push            rdi
        push            rcx
        ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        SAD16X3_ALIGN_DISPATCH .aom_sad16x8x3_ssse3

        ; Offsets 0..14: aligned reference loads + palignr.
        PROCESS_16X8X3_OFFSET 0,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3

        ; Offset 15: the +2 window would start at byte 17 and run past the
        ; 32 bytes covered by the two aligned loads, so this path uses the
        ; unaligned-load variant and falls through to store_off.
.aom_sad16x8x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
%rep 3
        PROCESS_16X2X3 0
%endrep

.aom_sad16x8x3_ssse3_store_off:
        SAD16X3_STORE_RESULTS

        ; begin epilog
        pop             rcx
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret