;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant added before the final "psraw 7" (64 == 1 << 6).
pw_64:          times 8 dw 64
; Keeps the even-indexed byte of every 16-bit lane.
even_byte_mask: times 8 dw 0x00ff

; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
; when using this instruction.
;
; The add order below (based on ffav1) must be followed to keep the
; intermediate sums in range ("outranges" otherwise).
; x = k0k1 + k4k5
; y = k2k3 + k6k7
; z = signed SAT(x + y)

SECTION .text

; Stack bytes reserved by the kernels that use SETUP_LOCAL_VARS:
; four coefficient slots plus two more slots (krd or tmp0/tmp1, see below).
%define LOCAL_VARS_SIZE 16*6

;-------------------------------------------------------------------------------
; SETUP_LOCAL_VARS
;
; Expands the filter held in m4 into per-tap-pair constants on the stack.
;
; In:    m4 = the 16 bytes loaded from filterq by the caller (eight 16-bit
;             values; packsswb narrows them to signed-saturated bytes).
; Out:   k0k1/k2k3/k4k5/k6k7 = [rsp + 16*0..3], each holding one byte pair
;             repeated across the whole register (operand for pmaddubsw).
;        krd  = pw_64; m12 on x86-64, [rsp + 16*4] otherwise.
;        tmp0/tmp1 = [rsp + 16*4], [rsp + 16*5] scratch names (x86-64 only).
; Clobb: m0-m4, plus m12 (x86-64) or m6 (x86-32).
;-------------------------------------------------------------------------------
%macro SETUP_LOCAL_VARS 0
    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
    ; pmaddubsw has a higher latency on some platforms, this might be eased by
    ; interleaving the instructions.
    %define    k0k1  [rsp + 16*0]
    %define    k2k3  [rsp + 16*1]
    %define    k4k5  [rsp + 16*2]
    %define    k6k7  [rsp + 16*3]
    packsswb     m4, m4
    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
    ; some platforms.
    pshuflw      m0, m4, 0b              ;k0_k1
    pshuflw      m1, m4, 01010101b       ;k2_k3
    pshuflw      m2, m4, 10101010b       ;k4_k5
    pshuflw      m3, m4, 11111111b       ;k6_k7
    punpcklqdq   m0, m0                  ;copy low qword into high qword
    punpcklqdq   m1, m1
    punpcklqdq   m2, m2
    punpcklqdq   m3, m3
    mova       k0k1, m0
    mova       k2k3, m1
    mova       k4k5, m2
    mova       k6k7, m3
%if ARCH_X86_64
    %define     krd  m12
    %define    tmp0  [rsp + 16*4]
    %define    tmp1  [rsp + 16*5]
    mova        krd, [GLOBAL(pw_64)]
%else
    %define     krd  [rsp + 16*4]
%if CONFIG_PIC=0
    mova         m6, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb      m6, m6                  ;all ones
    psrlw        m6, 15
    psllw        m6, 6                   ;aka pw_64
%endif
    mova        krd, m6
%endif
%endm

;-------------------------------------------------------------------------------
; The 4-wide horizontal kernel keeps its constants in xmm registers on x86-64
; and only needs stack slots on x86-32.
%if ARCH_X86_64
    %define LOCAL_VARS_SIZE_H4 0
%else
    %define LOCAL_VARS_SIZE_H4 16*4
%endif

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER4 variant
;
; Emits filter_block1d4_<variant>: 8-tap horizontal filter producing 4 output
; bytes per row. <variant> is compared against h8_avg (average the result with
; the bytes already at dst) and h8_add_src (add the source pixels to the
; filtered value before packing); any other name gets the plain filter.
;
; Args (cglobal names): src, sstride, dst, dstride, height, filter
;   - each row reads 16 bytes starting at src - 3 (unaligned load)
;   - filter is read with mova, so it must be 16-byte aligned
;   - height is assumed to be >= 1
; Rows are processed two at a time; an odd trailing row (including the
; height == 1 case) is handled by the single-row tail at .last_row.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER4 1
cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                            src, sstride, dst, dstride, height, filter
    mova                m4, [filterq]
    packsswb            m4, m4
%if ARCH_X86_64
    %define       k0k1k4k5  m8
    %define       k2k3k6k7  m9
    %define            krd  m10
    mova               krd, [GLOBAL(pw_64)]
    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
%else
    %define       k0k1k4k5  [rsp + 16*0]
    %define       k2k3k6k7  [rsp + 16*1]
    %define            krd  [rsp + 16*2]
    pshuflw             m6, m4, 0b              ;k0_k1
    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
    pshuflw             m7, m4, 01010101b       ;k2_k3
    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
%if CONFIG_PIC=0
    mova                m1, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb             m1, m1                  ;all ones
    psrlw               m1, 15
    psllw               m1, 6                   ;aka pw_64
%endif
    mova          k0k1k4k5, m6
    mova          k2k3k6k7, m7
    mova               krd, m1
%endif
    ; heightd = height - 1. The loop below always emits two rows, so a
    ; single-row request must skip it or it would store an extra row.
    dec            heightd
    jz          .last_row

.loop:
    ;Do two rows at once
    movu                m4, [srcq - 3]
    movu                m5, [srcq + sstrideq - 3]
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    punpckhbw           m3, m5, m5
    punpcklbw           m5, m5
    palignr             m0, m1, m4, 1
    pmaddubsw           m0, k0k1k4k5
    palignr             m1, m4, 5
    pmaddubsw           m1, k2k3k6k7
    palignr             m2, m3, m5, 1
    pmaddubsw           m2, k0k1k4k5
    palignr             m3, m5, 5
    pmaddubsw           m3, k2k3k6k7
    punpckhqdq          m4, m0, m2
    punpcklqdq          m0, m2
    punpckhqdq          m5, m1, m3
    punpcklqdq          m1, m3
    paddsw              m0, m4
    paddsw              m1, m5
%ifidn %1, h8_avg
    movd                m4, [dstq]
    movd                m5, [dstq + dstrideq]
%endif
    paddsw              m0, m1
    paddsw              m0, krd
    psraw               m0, 7
%ifidn %1, h8_add_src
    pxor                m3, m3
    movu                m4, [srcq]
    movu                m5, [srcq + sstrideq]
    punpckldq           m4, m5 ; Bytes 0,1,2,3 of this row, then 0,1,2,3 of the next row
    punpcklbw           m4, m3
    paddsw              m0, m4
%endif
    packuswb            m0, m0
    psrldq              m1, m0, 4               ;second row's 4 bytes

%ifidn %1, h8_avg
    pavgb               m0, m4
    pavgb               m1, m5
%endif
    movd            [dstq], m0
    movd [dstq + dstrideq], m1

    lea               srcq, [srcq + sstrideq        ]
    prefetcht0              [srcq + 4 * sstrideq - 3]
    lea               srcq, [srcq + sstrideq        ]
    lea               dstq, [dstq + 2 * dstrideq    ]
    prefetcht0              [srcq + 2 * sstrideq - 3]

    sub            heightd, 2
    jg               .loop

    ; Do last row if output_height is odd
    jne              .done

.last_row:
    movu                m4, [srcq - 3]
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    palignr             m0, m1, m4, 1
    palignr             m1, m4, 5
    pmaddubsw           m0, k0k1k4k5
    pmaddubsw           m1, k2k3k6k7
    psrldq              m2, m0, 8
    psrldq              m3, m1, 8
    paddsw              m0, m2
    paddsw              m1, m3
    paddsw              m0, m1
    paddsw              m0, krd
    psraw               m0, 7
%ifidn %1, h8_add_src
    pxor                m3, m3
    movu                m4, [srcq]
    punpcklbw           m4, m3
    paddsw              m0, m4
%endif
    packuswb            m0, m0
%ifidn %1, h8_avg
    movd                m4, [dstq]
    pavgb               m0, m4
%endif
    movd            [dstq], m0
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER8 variant
;
; Emits filter_block1d8_<variant>: 8-tap horizontal filter producing 8 output
; bytes per row. Variant handling (h8_avg / h8_add_src) and arguments are the
; same as for SUBPIX_HFILTER4; constants come from SETUP_LOCAL_VARS.
; Rows are processed two at a time; an odd trailing row (including the
; height == 1 case) is handled by the single-row tail at .last_row.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER8 1
cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                            src, sstride, dst, dstride, height, filter
    mova                m4, [filterq]

    SETUP_LOCAL_VARS

    ; heightd = height - 1; skip the two-row loop for a single-row request.
    dec            heightd
    jz          .last_row

.loop:
    ;Do two rows at once
    movu                m0, [srcq - 3]
    movu                m4, [srcq + sstrideq - 3]
    punpckhbw           m1, m0, m0
    punpcklbw           m0, m0
    palignr             m5, m1, m0, 13
    pmaddubsw           m5, k6k7
    palignr             m2, m1, m0, 5
    palignr             m3, m1, m0, 9
    palignr             m1, m0, 1
    pmaddubsw           m1, k0k1
    punpckhbw           m6, m4, m4
    punpcklbw           m4, m4
    pmaddubsw           m2, k2k3
    pmaddubsw           m3, k4k5

    palignr             m7, m6, m4, 13
    palignr             m0, m6, m4, 5
    pmaddubsw           m7, k6k7
    paddsw              m1, m3
    paddsw              m2, m5
    paddsw              m1, m2
%ifidn %1, h8_avg
    movh                m2, [dstq]
    movhps              m2, [dstq + dstrideq]
%endif
    palignr             m5, m6, m4, 9
    palignr             m6, m4, 1
    pmaddubsw           m0, k2k3
    pmaddubsw           m6, k0k1
    paddsw              m1, krd
    pmaddubsw           m5, k4k5
    psraw               m1, 7
    paddsw              m0, m7
    paddsw              m6, m5
    paddsw              m6, m0
    paddsw              m6, krd
    psraw               m6, 7
%ifidn %1, h8_add_src
    pxor                m3, m3
    movu                m4, [srcq]
    movu                m5, [srcq + sstrideq]
    punpcklbw           m4, m3
    punpcklbw           m5, m3
    paddsw              m1, m4
    paddsw              m6, m5
%endif
    packuswb            m1, m6                  ;row 0 in low half, row 1 in high half
%ifidn %1, h8_avg
    pavgb               m1, m2
%endif
    movh              [dstq], m1
    movhps [dstq + dstrideq], m1

    lea               srcq, [srcq + sstrideq        ]
    prefetcht0              [srcq + 4 * sstrideq - 3]
    lea               srcq, [srcq + sstrideq        ]
    lea               dstq, [dstq + 2 * dstrideq    ]
    prefetcht0              [srcq + 2 * sstrideq - 3]
    sub            heightd, 2
    jg               .loop

    ; Do last row if output_height is odd
    jne              .done

.last_row:
    movu                m0, [srcq - 3]
    punpckhbw           m3, m0, m0
    punpcklbw           m0, m0
    palignr             m1, m3, m0, 1
    palignr             m2, m3, m0, 5
    palignr             m4, m3, m0, 13
    palignr             m3, m0, 9
    pmaddubsw           m1, k0k1
    pmaddubsw           m2, k2k3
    pmaddubsw           m3, k4k5
    pmaddubsw           m4, k6k7
    paddsw              m1, m3
    paddsw              m4, m2
    paddsw              m1, m4
    paddsw              m1, krd
    psraw               m1, 7
%ifidn %1, h8_add_src
    pxor                m6, m6
    movu                m5, [srcq]
    punpcklbw           m5, m6
    paddsw              m1, m5
%endif
    packuswb            m1, m1
%ifidn %1, h8_avg
    movh                m0, [dstq]
    pavgb               m1, m0
%endif
    movh            [dstq], m1
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER16 variant
;
; Emits filter_block1d16_<variant>: 8-tap horizontal filter producing 16
; output bytes per row, one row per iteration.
;   - m0 accumulates the even output pixels (loads at src-3, -1, +1, +3)
;   - m4 accumulates the odd output pixels  (loads at src-2,  0, +2, +4)
;   - the two are interleaved back with punpcklbw before the store
; dst is written with mova (and read with an aligned pavgb memory operand for
; h8_avg), so dst rows must be 16-byte aligned. height is assumed >= 1.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER16 1
cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]

    SETUP_LOCAL_VARS

.loop:
    prefetcht0        [srcq + 2 * sstrideq -3]

    movu          m0, [srcq - 3]
    movu          m4, [srcq - 2]
    pmaddubsw     m0, k0k1
    pmaddubsw     m4, k0k1
    movu          m1, [srcq - 1]
    movu          m5, [srcq + 0]
    pmaddubsw     m1, k2k3
    pmaddubsw     m5, k2k3
    movu          m2, [srcq + 1]
    movu          m6, [srcq + 2]
    pmaddubsw     m2, k4k5
    pmaddubsw     m6, k4k5
    movu          m3, [srcq + 3]
    movu          m7, [srcq + 4]
    pmaddubsw     m3, k6k7
    pmaddubsw     m7, k6k7
    paddsw        m0, m2
    paddsw        m1, m3
    paddsw        m0, m1
    paddsw        m4, m6
    paddsw        m5, m7
    paddsw        m4, m5
    paddsw        m0, krd
    paddsw        m4, krd
    psraw         m0, 7
    psraw         m4, 7
%ifidn %1, h8_add_src
  %if ARCH_X86=1 && CONFIG_PIC=1
    pcmpeqb       m2, m2                  ;all ones
    psrlw         m2, 8                   ;even_byte_mask
  %else
    mova          m2, [GLOBAL(even_byte_mask)]
  %endif
    movu          m5, [srcq]
    mova          m7, m5
    pand          m5, m2                  ;even source bytes as words
    psrlw         m7, 8                   ;odd source bytes as words
    paddsw        m0, m5
    paddsw        m4, m7
%endif
    packuswb      m0, m0
    packuswb      m4, m4
    punpcklbw     m0, m4                  ;interleave even/odd pixels
%ifidn %1, h8_avg
    pavgb         m0, [dstq]
%endif
    lea         srcq, [srcq + sstrideq]
    mova      [dstq], m0
    lea         dstq, [dstq + dstrideq]
    dec      heightd
    jnz        .loop
    REP_RET
%endm

INIT_XMM ssse3
SUBPIX_HFILTER16 h8
SUBPIX_HFILTER8  h8
SUBPIX_HFILTER4  h8

;-------------------------------------------------------------------------------

; TODO(Linfeng): Detect cpu type and choose the code with better performance.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

; The "slow Celeron" x86-64 code path uses two extra general registers
; (r7 and r8) on top of the six argument registers.
%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    %define NUM_GENERAL_REG_USED 9
%else
    %define NUM_GENERAL_REG_USED 6
%endif

;-------------------------------------------------------------------------------
; SUBPIX_VFILTER variant, width
;
; Emits filter_block1d<width>_<variant>: 8-tap vertical filter. <width> is
; compared against 8 (rows moved with movh) and otherwise treated as 4 (rows
; moved with movd). <variant> is compared against v8_avg and v8_add_src.
;
; Args (cglobal names): src, sstride, dst, dstride, height, filter
;   - each output row reads eight source rows starting at src (A..H)
;   - filter is read with mova, so it must be 16-byte aligned
;   - height is assumed to be >= 1
; Two code paths exist: the first is assembled when ARCH_X86 or
; X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON is non-zero and reloads all rows each
; iteration; the second keeps the row pairs in registers between iterations.
; Both produce two rows per iteration and share the odd-row convention:
; .last_row handles an odd trailing row, including height == 1.
;
; NOTE(review): for v8_add_src the first path adds the row at srcq/src1q,
; whereas SUBPIX_VFILTER16 adds the row three below srcq - confirm which is
; intended (the variant is not instantiated in this file).
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER 2
cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

%ifidn %2, 8
    %define                movx  movh
%else
    %define                movx  movd
%endif

    dec                 heightd                             ;heightd = height - 1

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    ; No spare registers: reuse filterq (already consumed) and dstrideq,
    ; and take the destination stride from its memory operand instead.
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    mov                   src1q, srcq
    add                   src1q, sstrideq                   ;src1 = src + pitch
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq                   ;pitch * 6

    ; The loop below always emits two rows; a single-row request must go
    ; straight to the one-row tail or it would store an extra row.
    test                heightd, heightd
    jz               .last_row

.loop:
    ;Do two rows at once
    movx                     m0, [srcq                ]     ;A
    movx                     m1, [src1q               ]     ;B
    punpcklbw                m0, m1                         ;A B
    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    pmaddubsw                m0, k0k1
    mova                     m6, m2
    movx                     m3, [src1q + sstrideq * 2]     ;D
    punpcklbw                m2, m3                         ;C D
    pmaddubsw                m2, k2k3
    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    mova                     m7, m4
    movx                     m5, [src1q + sstrideq * 4]     ;F
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m4, k4k5
    punpcklbw                m1, m6                         ;A B next iter
    movx                     m6, [srcq + sstride6q    ]     ;G
    punpcklbw                m5, m6                         ;E F next iter
    punpcklbw                m3, m7                         ;C D next iter
    pmaddubsw                m5, k4k5
    movx                     m7, [src1q + sstride6q   ]     ;H
    punpcklbw                m6, m7                         ;G H
    pmaddubsw                m6, k6k7
    pmaddubsw                m3, k2k3
    pmaddubsw                m1, k0k1
    paddsw                   m0, m4
    paddsw                   m2, m6
    movx                     m6, [srcq + sstrideq * 8 ]     ;row after H (pairs with H for the second output row)
    punpcklbw                m7, m6                         ;G H next iter
    pmaddubsw                m7, k6k7
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    paddsw                   m1, m5
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [srcq]
    punpcklbw                m4, m6
    paddsw                   m0, m4
%endif
    packuswb                 m0, m0

    paddsw                   m3, m7
    paddsw                   m1, m3
    paddsw                   m1, krd
    psraw                    m1, 7
%ifidn %1, v8_add_src
    movu                     m4, [src1q]
    punpcklbw                m4, m6                         ;m6 is still zero here
    paddsw                   m1, m4
%endif
    lea                    srcq, [srcq + sstrideq * 2 ]
    lea                   src1q, [src1q + sstrideq * 2]
    packuswb                 m1, m1

%ifidn %1, v8_avg
    movx                     m2, [dstq]
    pavgb                    m0, m2
%endif
    movx                 [dstq], m0
    add                    dstq, dst_stride
%ifidn %1, v8_avg
    movx                     m3, [dstq]
    pavgb                    m1, m3
%endif
    movx                 [dstq], m1
    add                    dstq, dst_stride
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

.last_row:
    movx                     m0, [srcq                ]     ;A
    movx                     m1, [srcq + sstrideq     ]     ;B
    movx                     m6, [srcq + sstride6q    ]     ;G
    punpcklbw                m0, m1                         ;A B
    movx                     m7, [src1q + sstride6q   ]     ;H
    pmaddubsw                m0, k0k1
    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    punpcklbw                m6, m7                         ;G H
    movx                     m3, [src1q + sstrideq * 2]     ;D
    pmaddubsw                m6, k6k7
    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    punpcklbw                m2, m3                         ;C D
    movx                     m5, [src1q + sstrideq * 4]     ;F
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    paddsw                   m2, m6
    paddsw                   m0, m4
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [srcq]
    punpcklbw                m4, m6
    paddsw                   m0, m4
%endif
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%else
    ; ARCH_X86_64
    ; Prime the pipeline: load rows A..G once and keep the interleaved row
    ; pairs in registers; each iteration then only loads two new rows.
    movx                     m0, [srcq                ]     ;A
    movx                     m1, [srcq + sstrideq     ]     ;B
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m2, [srcq]                     ;C
    movx                     m3, [srcq + sstrideq]          ;D
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m4, [srcq]                     ;E
    movx                     m5, [srcq + sstrideq]          ;F
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m6, [srcq]                     ;G
    punpcklbw                m0, m1                         ;A B
    punpcklbw                m1, m2                         ;A B next iter
    punpcklbw                m2, m3                         ;C D
    punpcklbw                m3, m4                         ;C D next iter
    punpcklbw                m4, m5                         ;E F
    punpcklbw                m5, m6                         ;E F next iter

    ; Single-row request: go straight to the one-row tail, which only needs
    ; m0/m2/m4/m6 as set up above.
    test                heightd, heightd
    jz               .last_row

.loop:
    ;Do two rows at once
    movx                     m7, [srcq + sstrideq]          ;H
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                    m14, [srcq]                     ;H next iter
    punpcklbw                m6, m7                         ;G H
    punpcklbw                m7, m14                        ;G H next iter
    pmaddubsw                m8, m0, k0k1
    pmaddubsw                m9, m1, k0k1
    mova                     m0, m2
    mova                     m1, m3
    pmaddubsw               m10, m2, k2k3
    pmaddubsw               m11, m3, k2k3
    mova                     m2, m4
    mova                     m3, m5
    pmaddubsw                m4, k4k5
    pmaddubsw                m5, k4k5
    paddsw                   m8, m4
    paddsw                   m9, m5
    mova                     m4, m6
    mova                     m5, m7
    pmaddubsw                m6, k6k7
    pmaddubsw                m7, k6k7
    paddsw                  m10, m6
    paddsw                  m11, m7
    paddsw                   m8, m10
    paddsw                   m9, m11
    mova                     m6, m14
    paddsw                   m8, krd
    paddsw                   m9, krd
    psraw                    m8, 7
    psraw                    m9, 7
%ifidn %2, 4
    packuswb                 m8, m8
    packuswb                 m9, m9
%else
    packuswb                 m8, m9
%endif

%ifidn %1, v8_avg
    movx                     m7, [dstq]
%ifidn %2, 4
    movx                    m10, [dstq + dstrideq]
    pavgb                    m9, m10
%else
    movhpd                   m7, [dstq + dstrideq]
%endif
    pavgb                    m8, m7
%endif
    movx                 [dstq], m8
%ifidn %2, 4
    movx      [dstq + dstrideq], m9
%else
    movhpd    [dstq + dstrideq], m8
%endif

    lea                    dstq, [dstq + dstrideq * 2 ]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

.last_row:
    movx                     m7, [srcq + sstrideq]          ;H
    punpcklbw                m6, m7                         ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    pmaddubsw                m6, k6k7
    paddsw                   m0, m4
    paddsw                   m2, m6
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%endif ; ARCH_X86_64

.done:
    REP_RET

%endm

;-------------------------------------------------------------------------------
; SUBPIX_VFILTER16 variant
;
; Emits filter_block1d16_<variant>: 8-tap vertical filter producing 16 output
; bytes per row. <variant> is compared against v8_avg and v8_add_src.
;
; Args (cglobal names): src, sstride, dst, dstride, height, filter
;   - dst is written with mova (and read with an aligned pavgb memory operand
;     for v8_avg), so dst rows must be 16-byte aligned
;   - filter is read with mova, so it must be 16-byte aligned
;   - height is assumed to be >= 1
; The first code path (ARCH_X86 or X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
; non-zero) emits one row per iteration as two 8-byte halves. The second keeps
; row pairs in registers (plus tmp0/tmp1 on the stack), emits two rows per
; iteration, and has a one-row tail at .last_row for odd heights.
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova                     m4, [filterq]
    SETUP_LOCAL_VARS

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    ; No spare registers: reuse filterq (already consumed) and dstrideq,
    ; and take the destination stride from its memory operand instead.
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    lea                   src1q, [srcq + sstrideq]
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq                   ;pitch * 6

.loop:
    ; Left 8 columns -> m0
    movh                     m0, [srcq                ]     ;A
    movh                     m1, [src1q               ]     ;B
    movh                     m2, [srcq + sstrideq * 2 ]     ;C
    movh                     m3, [src1q + sstrideq * 2]     ;D
    movh                     m4, [srcq + sstrideq * 4 ]     ;E
    movh                     m5, [src1q + sstrideq * 4]     ;F

    punpcklbw                m0, m1                         ;A B
    movh                     m6, [srcq + sstride6q]         ;G
    punpcklbw                m2, m3                         ;C D
    movh                     m7, [src1q + sstride6q]        ;H
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m0, k0k1
    ; Right 8 columns (byte offset + 8) -> m3, interleaved with the left half
    movh                     m3, [srcq + 8]                 ;A
    pmaddubsw                m2, k2k3
    punpcklbw                m6, m7                         ;G H
    movh                     m5, [srcq + sstrideq + 8]      ;B
    pmaddubsw                m4, k4k5
    punpcklbw                m3, m5                         ;A B
    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
    pmaddubsw                m6, k6k7
    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
    punpcklbw                m7, m5                         ;C D
    paddsw                   m2, m6
    pmaddubsw                m3, k0k1
    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
    paddsw                   m0, m4
    pmaddubsw                m7, k2k3
    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
    punpcklbw                m1, m6                         ;E F
    paddsw                   m0, m2
    paddsw                   m0, krd
    movh                     m2, [srcq + sstride6q + 8]     ;G
    pmaddubsw                m1, k4k5
    movh                     m5, [src1q + sstride6q + 8]    ;H
    psraw                    m0, 7
    punpcklbw                m2, m5                         ;G H
    pmaddubsw                m2, k6k7
    paddsw                   m7, m2
    paddsw                   m3, m1
    paddsw                   m3, m7
    paddsw                   m3, krd
    psraw                    m3, 7
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
    mova                     m5, m4
    punpcklbw                m4, m6
    punpckhbw                m5, m6
    paddsw                   m0, m4
    paddsw                   m3, m5
%endif
    packuswb                 m0, m3

    add                    srcq, sstrideq
    add                   src1q, sstrideq
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0
    add                    dstq, dst_stride
    dec                 heightd
    jnz                   .loop
    REP_RET

%else
    ; ARCH_X86_64
    dec                 heightd                             ;heightd = height - 1

    ; Prime the pipeline: load rows A..G once and keep the interleaved low/high
    ; halves of each row pair in registers (A B next iter goes to tmp0/tmp1).
    movu                     m1, [srcq                ]     ;A
    movu                     m3, [srcq + sstrideq     ]     ;B
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m0, m1, m3                     ;A B
    punpckhbw                m1, m3                         ;A B
    movu                     m5, [srcq]                     ;C
    punpcklbw                m2, m3, m5                     ;A B next iter
    punpckhbw                m3, m5                         ;A B next iter
    mova                   tmp0, m2                         ;store to stack
    mova                   tmp1, m3                         ;store to stack
    movu                     m7, [srcq + sstrideq]          ;D
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m4, m5, m7                     ;C D
    punpckhbw                m5, m7                         ;C D
    movu                     m9, [srcq]                     ;E
    punpcklbw                m6, m7, m9                     ;C D next iter
    punpckhbw                m7, m9                         ;C D next iter
    movu                    m11, [srcq + sstrideq]          ;F
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m8, m9, m11                    ;E F
    punpckhbw                m9, m11                        ;E F
    movu                     m2, [srcq]                     ;G
    punpcklbw               m10, m11, m2                    ;E F next iter
    punpckhbw               m11, m2                         ;E F next iter

    ; Single-row request: go straight to the one-row tail, which only needs
    ; m0/m1, m4/m5, m8/m9 and m2 as set up above.
    test                heightd, heightd
    jz               .last_row

.loop:
    ;Do two rows at once
    pmaddubsw               m13, m0, k0k1
    mova                     m0, m4
    pmaddubsw               m14, m8, k4k5
    pmaddubsw               m15, m4, k2k3
    mova                     m4, m8
    paddsw                  m13, m14
    movu                     m3, [srcq + sstrideq]          ;H
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw               m14, m2, m3                     ;G H
    mova                     m8, m14
    pmaddubsw               m14, k6k7
    paddsw                  m15, m14
    paddsw                  m13, m15
    paddsw                  m13, krd
    psraw                   m13, 7

    pmaddubsw               m14, m1, k0k1
    pmaddubsw                m1, m9, k4k5
    pmaddubsw               m15, m5, k2k3
    paddsw                  m14, m1
    mova                     m1, m5
    mova                     m5, m9
    punpckhbw                m2, m3                         ;G H
    mova                     m9, m2
    pmaddubsw                m2, k6k7
    paddsw                  m15, m2
    paddsw                  m14, m15
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m13, m14
%ifidn %1, v8_avg
    pavgb                   m13, [dstq]
%endif
    mova                 [dstq], m13

    ; next iter
    pmaddubsw               m15, tmp0, k0k1
    pmaddubsw               m14, m10, k4k5
    pmaddubsw               m13, m6, k2k3
    paddsw                  m15, m14
    mova                   tmp0, m6
    mova                     m6, m10
    movu                     m2, [srcq]                     ;G next iter
    punpcklbw               m14, m3, m2                     ;G H next iter
    mova                    m10, m14
    pmaddubsw               m14, k6k7
    paddsw                  m13, m14
    paddsw                  m15, m13
    paddsw                  m15, krd
    psraw                   m15, 7

    pmaddubsw               m14, tmp1, k0k1
    mova                   tmp1, m7
    pmaddubsw               m13, m7, k2k3
    mova                     m7, m11
    pmaddubsw               m11, k4k5
    paddsw                  m14, m11
    punpckhbw                m3, m2                         ;G H next iter
    mova                    m11, m3
    pmaddubsw                m3, k6k7
    paddsw                  m13, m3
    paddsw                  m14, m13
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m15, m14
%ifidn %1, v8_avg
    pavgb                   m15, [dstq + dstrideq]
%endif
    mova      [dstq + dstrideq], m15
    lea                    dstq, [dstq + dstrideq * 2]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

.last_row:
    movu                     m3, [srcq + sstrideq]          ;H
    punpcklbw                m6, m2, m3                     ;G H
    punpckhbw                m2, m3                         ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m1, k0k1
    pmaddubsw                m4, k2k3
    pmaddubsw                m5, k2k3
    pmaddubsw                m8, k4k5
    pmaddubsw                m9, k4k5
    pmaddubsw                m6, k6k7
    pmaddubsw                m2, k6k7
    paddsw                   m0, m8
    paddsw                   m1, m9
    paddsw                   m4, m6
    paddsw                   m5, m2
    paddsw                   m0, m4
    paddsw                   m1, m5
    paddsw                   m0, krd
    paddsw                   m1, krd
    psraw                    m0, 7
    psraw                    m1, 7
    packuswb                 m0, m1
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0

.done:
    REP_RET

%endif ; ARCH_X86_64

%endm

INIT_XMM ssse3
SUBPIX_VFILTER16     v8
SUBPIX_VFILTER       v8, 8
SUBPIX_VFILTER       v8, 4