diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm')
-rw-r--r-- | third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 870 |
1 files changed, 0 insertions, 870 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm deleted file mode 100644 index 3ca7921b6..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,870 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_64: times 8 dw 64 -even_byte_mask: times 8 dw 0x00ff - -; %define USE_PMULHRSW -; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss -; when using this instruction. -; -; The add order below (based on ffav1) must be followed to prevent outranges. -; x = k0k1 + k4k5 -; y = k2k3 + k6k7 -; z = signed SAT(x + y) - -SECTION .text -%define LOCAL_VARS_SIZE 16*6 - -%macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + - ; pmaddubsw has a higher latency on some platforms, this might be eased by - ; interleaving the instructions. - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - packsswb m4, m4 - ; TODO(slavarnway): multiple pshufb instructions had a higher latency on - ; some platforms. 
- pshuflw m0, m4, 0b ;k0_k1 - pshuflw m1, m4, 01010101b ;k2_k3 - pshuflw m2, m4, 10101010b ;k4_k5 - pshuflw m3, m4, 11111111b ;k6_k7 - punpcklqdq m0, m0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - mova k0k1, m0 - mova k2k3, m1 - mova k4k5, m2 - mova k6k7, m3 -%if ARCH_X86_64 - %define krd m12 - %define tmp0 [rsp + 16*4] - %define tmp1 [rsp + 16*5] - mova krd, [GLOBAL(pw_64)] -%else - %define krd [rsp + 16*4] -%if CONFIG_PIC=0 - mova m6, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m6, m6 ;all ones - psrlw m6, 15 - psllw m6, 6 ;aka pw_64 -%endif - mova krd, m6 -%endif -%endm - -;------------------------------------------------------------------------------- -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE_H4 0 -%else - %define LOCAL_VARS_SIZE_H4 16*4 -%endif - -%macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - packsswb m4, m4 -%if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 - mova krd, [GLOBAL(pw_64)] - pshuflw k0k1k4k5, m4, 0b ;k0_k1 - pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 - pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 - pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 -%else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] - pshuflw m6, m4, 0b ;k0_k1 - pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 - pshuflw m7, m4, 01010101b ;k2_k3 - pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 -%if CONFIG_PIC=0 - mova m1, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m1, m1 ;all ones - psrlw m1, 15 - psllw m1, 6 ;aka pw_64 -%endif - mova k0k1k4k5, m6 - mova k2k3k6k7, m7 - mova krd, m1 -%endif - dec heightd - -.loop: - ;Do two rows at once - movu m4, [srcq - 3] - movu m5, [srcq + sstrideq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - punpckhbw m3, m5, m5 - punpcklbw m5, m5 - palignr m0, m1, m4, 1 - pmaddubsw m0, 
k0k1k4k5 - palignr m1, m4, 5 - pmaddubsw m1, k2k3k6k7 - palignr m2, m3, m5, 1 - pmaddubsw m2, k0k1k4k5 - palignr m3, m5, 5 - pmaddubsw m3, k2k3k6k7 - punpckhqdq m4, m0, m2 - punpcklqdq m0, m2 - punpckhqdq m5, m1, m3 - punpcklqdq m1, m3 - paddsw m0, m4 - paddsw m1, m5 -%ifidn %1, h8_avg - movd m4, [dstq] - movd m5, [dstq + dstrideq] -%endif - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - movu m5, [srcq + sstrideq] - punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1 - punpcklbw m4, m3 - paddsw m0, m4 -%endif - packuswb m0, m0 - psrldq m1, m0, 4 - -%ifidn %1, h8_avg - pavgb m0, m4 - pavgb m1, m5 -%endif - movd [dstq], m0 - movd [dstq + dstrideq], m1 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m4, [srcq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - palignr m0, m1, m4, 1 - palignr m1, m4, 5 - pmaddubsw m0, k0k1k4k5 - pmaddubsw m1, k2k3k6k7 - psrldq m2, m0, 8 - psrldq m3, m1, 8 - paddsw m0, m2 - paddsw m1, m3 - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - punpcklbw m4, m3 - paddsw m0, m4 -%endif - packuswb m0, m0 -%ifidn %1, h8_avg - movd m4, [dstq] - pavgb m0, m4 -%endif - movd [dstq], m0 -.done: - REP_RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - dec heightd - -.loop: - ;Do two rows at once - movu m0, [srcq - 3] - movu m4, [srcq + sstrideq - 3] - punpckhbw m1, m0, m0 - punpcklbw m0, m0 - palignr m5, m1, m0, 13 - pmaddubsw m5, k6k7 - palignr m2, m1, m0, 5 - palignr m3, m1, m0, 9 - palignr m1, m0, 1 - 
pmaddubsw m1, k0k1 - punpckhbw m6, m4, m4 - punpcklbw m4, m4 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - - palignr m7, m6, m4, 13 - palignr m0, m6, m4, 5 - pmaddubsw m7, k6k7 - paddsw m1, m3 - paddsw m2, m5 - paddsw m1, m2 -%ifidn %1, h8_avg - movh m2, [dstq] - movhps m2, [dstq + dstrideq] -%endif - palignr m5, m6, m4, 9 - palignr m6, m4, 1 - pmaddubsw m0, k2k3 - pmaddubsw m6, k0k1 - paddsw m1, krd - pmaddubsw m5, k4k5 - psraw m1, 7 - paddsw m0, m7 - paddsw m6, m5 - paddsw m6, m0 - paddsw m6, krd - psraw m6, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - movu m5, [srcq + sstrideq] - punpcklbw m4, m3 - punpcklbw m5, m3 - paddsw m1, m4 - paddsw m6, m5 -%endif - packuswb m1, m6 -%ifidn %1, h8_avg - pavgb m1, m2 -%endif - movh [dstq], m1 - movhps [dstq + dstrideq], m1 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m0, [srcq - 3] - punpckhbw m3, m0, m0 - punpcklbw m0, m0 - palignr m1, m3, m0, 1 - palignr m2, m3, m0, 5 - palignr m4, m3, m0, 13 - palignr m3, m0, 9 - pmaddubsw m1, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - pmaddubsw m4, k6k7 - paddsw m1, m3 - paddsw m4, m2 - paddsw m1, m4 - paddsw m1, krd - psraw m1, 7 -%ifidn %1, h8_add_src - pxor m6, m6 - movu m5, [srcq] - punpcklbw m5, m6 - paddsw m1, m5 -%endif - packuswb m1, m1 -%ifidn %1, h8_avg - movh m0, [dstq] - pavgb m1, m0 -%endif - movh [dstq], m1 -.done: - REP_RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -.loop: - prefetcht0 [srcq + 2 * sstrideq -3] - - movu m0, [srcq - 3] - movu m4, [srcq - 2] - pmaddubsw m0, k0k1 - pmaddubsw m4, k0k1 - movu m1, [srcq - 1] 
- movu m5, [srcq + 0] - pmaddubsw m1, k2k3 - pmaddubsw m5, k2k3 - movu m2, [srcq + 1] - movu m6, [srcq + 2] - pmaddubsw m2, k4k5 - pmaddubsw m6, k4k5 - movu m3, [srcq + 3] - movu m7, [srcq + 4] - pmaddubsw m3, k6k7 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m1, m3 - paddsw m0, m1 - paddsw m4, m6 - paddsw m5, m7 - paddsw m4, m5 - paddsw m0, krd - paddsw m4, krd - psraw m0, 7 - psraw m4, 7 -%ifidn %1, h8_add_src -%if ARCH_X86=1 && CONFIG_PIC=1 - pcmpeqb m2, m2 ;all ones - psrlw m2, 8 ;even_byte_mask -%else - mova m2, [GLOBAL(even_byte_mask)] -%endif - movu m5, [srcq] - mova m7, m5 - pand m5, m2 - psrlw m7, 8 - paddsw m0, m5 - paddsw m4, m7 -%endif - packuswb m0, m0 - packuswb m4, m4 - punpcklbw m0, m4 -%ifidn %1, h8_avg - pavgb m0, [dstq] -%endif - lea srcq, [srcq + sstrideq] - mova [dstq], m0 - lea dstq, [dstq + dstrideq] - dec heightd - jnz .loop - REP_RET -%endm - -INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER4 h8 - -;------------------------------------------------------------------------------- - -; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 - -%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - %define NUM_GENERAL_REG_USED 9 -%else - %define NUM_GENERAL_REG_USED 6 -%endif - -%macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -%ifidn %2, 8 - %define movx movh -%else - %define movx movd -%endif - - dec heightd - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -.loop: - ;Do two rows at once - movx m0, [srcq ] ;A - movx m1, [src1q ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [srcq] ; NOTE(review): adds row A to the first output row, whereas SUBPIX_VFILTER16 adds the row 3 rows down (row D), confirm which is intended - punpcklbw m4, m6 - paddsw m0, m4 -%endif - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 -%ifidn %1, v8_add_src - movu m4, [src1q] ; NOTE(review): adds row B to the second output row, same question as for row A above - punpcklbw m4, m6 - paddsw m1, m4 -%endif - lea srcq, [srcq + sstrideq * 2 ] - lea 
src1q, [src1q + sstrideq * 2] - packuswb m1, m1 - -%ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 -%endif - movx [dstq], m0 - add dstq, dst_stride -%ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [srcq] - punpcklbw m4, m6 - paddsw m0, m4 -%endif - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%else - ; ARCH_X86_64 - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2 ] - movx m2, [srcq] ;C - movx m3, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2 ] - movx m4, [srcq] ;E - movx m5, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2 ] - movx m6, [srcq] ;G - punpcklbw m0, m1 ;A B - punpcklbw m1, m2 ;A B next iter - punpcklbw m2, m3 ;C D - punpcklbw m3, m4 ;C D next iter - punpcklbw m4, m5 ;E F - punpcklbw m5, m6 ;E F next iter - -.loop: - ;Do two rows at once - movx m7, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2 ] - movx m14, [srcq] ;H next iter - punpcklbw m6, m7 ;G H - punpcklbw m7, m14 ;G H next iter - pmaddubsw m8, m0, k0k1 - pmaddubsw m9, m1, k0k1 - mova m0, m2 - mova m1, m3 - pmaddubsw m10, m2, k2k3 - pmaddubsw m11, m3, k2k3 - mova m2, m4 - mova m3, m5 - pmaddubsw m4, k4k5 - pmaddubsw m5, k4k5 - paddsw m8, m4 - paddsw m9, m5 - mova m4, m6 - mova m5, m7 
- pmaddubsw m6, k6k7 - pmaddubsw m7, k6k7 - paddsw m10, m6 - paddsw m11, m7 - paddsw m8, m10 - paddsw m9, m11 - mova m6, m14 - paddsw m8, krd - paddsw m9, krd - psraw m8, 7 - psraw m9, 7 -%ifidn %2, 4 - packuswb m8, m8 - packuswb m9, m9 -%else - packuswb m8, m9 -%endif - -%ifidn %1, v8_avg - movx m7, [dstq] -%ifidn %2, 4 - movx m10, [dstq + dstrideq] - pavgb m9, m10 -%else - movhpd m7, [dstq + dstrideq] -%endif - pavgb m8, m7 -%endif - movx [dstq], m8 -%ifidn %2, 4 - movx [dstq + dstrideq], m9 -%else - movhpd [dstq + dstrideq], m8 -%endif - - lea dstq, [dstq + dstrideq * 2 ] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m7, [srcq + sstrideq] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - pmaddubsw m6, k6k7 - paddsw m0, m4 - paddsw m2, m6 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%endif ; ARCH_X86_64 - -.done: - REP_RET - -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - lea src1q, [srcq + sstrideq] - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -.loop: - movh m0, [srcq ] ;A - movh m1, [src1q ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw 
m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down - mova m5, m4 - punpcklbw m4, m6 - punpckhbw m5, m6 - paddsw m0, m4 - paddsw m3, m5 -%endif - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, [dstq] -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - REP_RET - -%else - ; ARCH_X86_64 - dec heightd - - movu m1, [srcq ] ;A - movu m3, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2] - punpcklbw m0, m1, m3 ;A B - punpckhbw m1, m3 ;A B - movu m5, [srcq] ;C - punpcklbw m2, m3, m5 ;A B next iter - punpckhbw m3, m5 ;A B next iter - mova tmp0, m2 ;store to stack - mova tmp1, m3 ;store to stack - movu m7, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2] - punpcklbw m4, m5, m7 ;C D - punpckhbw m5, m7 ;C D - movu m9, [srcq] ;E - punpcklbw m6, m7, m9 ;C D next iter - punpckhbw m7, m9 ;C D next iter - movu m11, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2] - punpcklbw m8, m9, m11 ;E F - punpckhbw m9, m11 ;E F - movu m2, [srcq] ;G - punpcklbw m10, m11, m2 ;E F next iter - punpckhbw m11, m2 ;E F next iter - -.loop: - ;Do two rows at once - pmaddubsw m13, m0, k0k1 - mova m0, m4 - pmaddubsw m14, m8, k4k5 - 
pmaddubsw m15, m4, k2k3 - mova m4, m8 - paddsw m13, m14 - movu m3, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2] - punpcklbw m14, m2, m3 ;G H - mova m8, m14 - pmaddubsw m14, k6k7 - paddsw m15, m14 - paddsw m13, m15 - paddsw m13, krd - psraw m13, 7 - - pmaddubsw m14, m1, k0k1 - pmaddubsw m1, m9, k4k5 - pmaddubsw m15, m5, k2k3 - paddsw m14, m1 - mova m1, m5 - mova m5, m9 - punpckhbw m2, m3 ;G H - mova m9, m2 - pmaddubsw m2, k6k7 - paddsw m15, m2 - paddsw m14, m15 - paddsw m14, krd - psraw m14, 7 - packuswb m13, m14 -%ifidn %1, v8_avg - pavgb m13, [dstq] -%endif - mova [dstq], m13 - - ; next iter - pmaddubsw m15, tmp0, k0k1 - pmaddubsw m14, m10, k4k5 - pmaddubsw m13, m6, k2k3 - paddsw m15, m14 - mova tmp0, m6 - mova m6, m10 - movu m2, [srcq] ;G next iter - punpcklbw m14, m3, m2 ;G H next iter - mova m10, m14 - pmaddubsw m14, k6k7 - paddsw m13, m14 - paddsw m15, m13 - paddsw m15, krd - psraw m15, 7 - - pmaddubsw m14, tmp1, k0k1 - mova tmp1, m7 - pmaddubsw m13, m7, k2k3 - mova m7, m11 - pmaddubsw m11, k4k5 - paddsw m14, m11 - punpckhbw m3, m2 ;G H next iter - mova m11, m3 - pmaddubsw m3, k6k7 - paddsw m13, m3 - paddsw m14, m13 - paddsw m14, krd - psraw m14, 7 - packuswb m15, m14 -%ifidn %1, v8_avg - pavgb m15, [dstq + dstrideq] -%endif - mova [dstq + dstrideq], m15 - lea dstq, [dstq + dstrideq * 2] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m3, [srcq + sstrideq] ;H - punpcklbw m6, m2, m3 ;G H - punpckhbw m2, m3 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m1, k0k1 - pmaddubsw m4, k2k3 - pmaddubsw m5, k2k3 - pmaddubsw m8, k4k5 - pmaddubsw m9, k4k5 - pmaddubsw m6, k6k7 - pmaddubsw m2, k6k7 - paddsw m0, m8 - paddsw m1, m9 - paddsw m4, m6 - paddsw m5, m2 - paddsw m0, m4 - paddsw m1, m5 - paddsw m0, krd - paddsw m1, krd - psraw m0, 7 - psraw m1, 7 - packuswb m0, m1 -%ifidn %1, v8_avg - pavgb m0, [dstq] -%endif - mova [dstq], m0 - -.done: - REP_RET - -%endif ; ARCH_X86_64 - -%endm - -INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 
-SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8, 4 |