; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "aom_ports/x86_abi_support.asm" ; void av1_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 ; unsigned char *frame2, | 2 ; unsigned int block_width, | 3 ; unsigned int block_height, | 4 ; int strength, | 5 ; int filter_weight, | 6 ; unsigned int *accumulator, | 7 ; unsigned short *count) | 8 global sym(av1_temporal_filter_apply_sse2) PRIVATE sym(av1_temporal_filter_apply_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 SAVE_XMM 7 GET_GOT rbx push rsi push rdi ALIGN_STACK 16, rax %define block_width 0 %define block_height 16 %define strength 32 %define filter_weight 48 %define rounding_bit 64 %define rbp_backup 80 %define stack_size 96 sub rsp, stack_size mov [rsp + rbp_backup], rbp ; end prolog mov edx, arg(3) mov [rsp + block_width], rdx mov edx, arg(4) mov [rsp + block_height], rdx movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read ; calculate the rounding bit outside the loop ; 0x8000 >> (16 - strength) mov rdx, 16 sub rdx, arg(5) ; 16 - strength movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 movdqa [rsp + rounding_bit], xmm5 mov rsi, arg(0) ; src/frame1 mov rdx, arg(2) ; predictor frame mov rdi, arg(7) ; accumulator mov rax, arg(8) ; count ; dup the filter weight and store for later movd xmm0, arg(6) ; filter_weight pshuflw xmm0, xmm0, 0 punpcklwd xmm0, xmm0 movdqa [rsp + filter_weight], xmm0 mov rbp, arg(1) ; stride pxor xmm7, xmm7 ; zero for extraction mov rcx, [rsp + block_width] imul rcx, [rsp + block_height] add rcx, rdx cmp dword ptr [rsp + block_width], 8 jne .temporal_filter_apply_load_16 .temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm0, xmm7 ; src[ 0- 7] movq xmm1, [rsi] ; second row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm1, xmm7 ; src[ 8-15] jmp .temporal_filter_apply_load_finished .temporal_filter_apply_load_16: movdqa xmm0, [rsi] ; src (frame1) lea rsi, [rsi + rbp] ; += stride movdqa xmm1, xmm0 punpcklbw xmm0, xmm7 ; src[ 0- 7] punpckhbw xmm1, xmm7 ; src[ 8-15] .temporal_filter_apply_load_finished: movdqa xmm2, [rdx] ; predictor (frame2) movdqa xmm3, xmm2 punpcklbw xmm2, xmm7 ; pred[ 0- 7] punpckhbw xmm3, xmm7 ; pred[ 8-15] ; modifier = src_byte - pixel_value psubw xmm0, xmm2 ; src - pred[ 0- 7] psubw xmm1, xmm3 ; src - pred[ 8-15] ; modifier *= modifier pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 pmullw xmm1, xmm1 ; modifer[ 8-15]^2 ; modifier *= 3 pmullw xmm0, [GLOBAL(_const_3w)] pmullw xmm1, [GLOBAL(_const_3w)] ; modifer += 0x8000 >> (16 - strength) paddw xmm0, [rsp + rounding_bit] paddw xmm1, [rsp + rounding_bit] ; modifier >>= strength psrlw xmm0, [rsp + strength] psrlw xmm1, [rsp + strength] ; modifier = 16 - modifier ; saturation takes care of modifier > 16 movdqa xmm3, [GLOBAL(_const_16w)] movdqa xmm2, [GLOBAL(_const_16w)] psubusw xmm3, xmm1 psubusw xmm2, xmm0 ; modifier *= filter_weight pmullw xmm2, [rsp + filter_weight] pmullw xmm3, [rsp + filter_weight] ; count movdqa xmm4, [rax] movdqa xmm5, [rax+16] ; += modifier paddw xmm4, xmm2 paddw xmm5, xmm3 ; write back movdqa [rax], xmm4 movdqa [rax+16], xmm5 lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) ; load and extract the predictor up to shorts pxor xmm7, xmm7 movdqa xmm0, [rdx] lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) movdqa xmm1, xmm0 punpcklbw xmm0, xmm7 ; pred[ 0- 7] punpckhbw xmm1, xmm7 ; pred[ 8-15] ; modifier *= pixel_value pmullw xmm0, xmm2 pmullw xmm1, xmm3 ; expand to double words movdqa xmm2, xmm0 punpcklwd xmm0, xmm7 ; [ 0- 3] punpckhwd xmm2, xmm7 ; [ 4- 7] movdqa xmm3, xmm1 punpcklwd xmm1, xmm7 ; [ 8-11] punpckhwd xmm3, xmm7 ; [12-15] ; accumulator movdqa xmm4, [rdi] movdqa xmm5, [rdi+16] movdqa xmm6, [rdi+32] movdqa xmm7, [rdi+48] ; += modifier paddd xmm4, xmm0 paddd xmm5, xmm2 paddd xmm6, xmm1 paddd xmm7, xmm3 ; write back movdqa [rdi], xmm4 movdqa [rdi+16], xmm5 movdqa [rdi+32], xmm6 movdqa [rdi+48], xmm7 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) cmp rdx, rcx je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction cmp dword ptr [rsp + block_width], 16 je .temporal_filter_apply_load_16 jmp .temporal_filter_apply_load_8 .temporal_filter_apply_epilog: ; begin epilog mov rbp, [rsp + rbp_backup] add rsp, stack_size pop rsp pop rdi pop rsi RESTORE_GOT RESTORE_XMM UNSHADOW_ARGS pop rbp ret SECTION_RODATA align 16 _const_3w: times 8 dw 3 align 16 _const_top_bit: times 8 dw 1<<15 align 16 _const_16w: times 8 dw 16