summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm870
1 files changed, 0 insertions, 870 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
deleted file mode 100644
index 3ca7921b6..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,870 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_64: times 8 dw 64 ; eight words of 64: the rounding term the filters add right before "psraw ..., 7"
-even_byte_mask: times 8 dw 0x00ff ; keeps the even-indexed bytes of a vector; used by the 16-wide h8_add_src path
-
-; %define USE_PMULHRSW
-; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
-; when using this instruction.
-;
-; The add order below (based on ffav1) must be followed to keep the intermediate sums inside the signed 16-bit range.
-; x = k0k1 + k4k5
-; y = k2k3 + k6k7
-; z = signed SAT(x + y)
-
-SECTION .text
-%define LOCAL_VARS_SIZE 16*6 ; six 16-byte stack slots: 0-3 hold the tap pairs; slot 4 is krd on x86-32; slots 4-5 are tmp0/tmp1 on x86-64
-
-%macro SETUP_LOCAL_VARS 0 ; in: m4 = filter taps loaded by the caller (treated as eight words); out: k0k1/k2k3/k4k5/k6k7 and krd defined and loaded; overwrites m0-m4 plus m12 (x86-64) or m6 (x86-32)
- ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
- ; pmaddubsw has a higher latency on some platforms, this might be eased by
- ; interleaving the instructions.
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- packsswb m4, m4 ; narrow the word taps to signed bytes (saturating) so pmaddubsw can consume them
- ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
- ; some platforms.
- pshuflw m0, m4, 0b ;k0_k1 (the byte pair replicated across the low four words)
- pshuflw m1, m4, 01010101b ;k2_k3
- pshuflw m2, m4, 10101010b ;k4_k5
- pshuflw m3, m4, 11111111b ;k6_k7
- punpcklqdq m0, m0 ; copy the low qword into the high qword: the pair now fills all eight words
- punpcklqdq m1, m1
- punpcklqdq m2, m2
- punpcklqdq m3, m3
- mova k0k1, m0 ; park the broadcast tap pairs in their stack slots
- mova k2k3, m1
- mova k4k5, m2
- mova k6k7, m3
-%if ARCH_X86_64
- %define krd m12
- %define tmp0 [rsp + 16*4]
- %define tmp1 [rsp + 16*5]
- mova krd, [GLOBAL(pw_64)] ; x86-64: the rounding term stays in a register
-%else
- %define krd [rsp + 16*4]
-%if CONFIG_PIC=0
- mova m6, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m6, m6 ;all ones
- psrlw m6, 15 ;every word = 1
- psllw m6, 6 ;aka pw_64
-%endif
- mova krd, m6 ; x86-32: the rounding term lives in a stack slot
-%endif
-%endm
-
-;-------------------------------------------------------------------------------
-%if ARCH_X86_64
- %define LOCAL_VARS_SIZE_H4 0
-%else
- %define LOCAL_VARS_SIZE_H4 16*4
-%endif ; stack bytes reserved by filter_block1d4: none on x86-64 (its constants stay in m8-m10), 16*4 on x86-32
-
-%macro SUBPIX_HFILTER4 1 ; %1 = variant (h8, h8_avg or h8_add_src): emits the 4-pixel-wide horizontal 8-tap filter, two rows per loop pass
-cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- packsswb m4, m4 ; taps as signed bytes; the same eight bytes end up in both qwords
-%if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
- mova krd, [GLOBAL(pw_64)]
- pshuflw k0k1k4k5, m4, 0b ;k0_k1
- pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
- pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
- pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
-%else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
- pshuflw m6, m4, 0b ;k0_k1
- pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
- pshuflw m7, m4, 01010101b ;k2_k3
- pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
-%if CONFIG_PIC=0
- mova m1, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m1, m1 ;all ones
- psrlw m1, 15 ;every word = 1
- psllw m1, 6 ;aka pw_64
-%endif
- mova k0k1k4k5, m6
- mova k2k3k6k7, m7
- mova krd, m1
-%endif
- dec heightd ; bias the count so the flags of the final "sub heightd, 2" tell odd heights from even ones
-
-.loop:
- ;Do two rows at once (NOTE(review): the body runs before any height check, so height is presumably >= 2 - confirm against callers)
- movu m4, [srcq - 3] ; row 0; p0 below denotes the byte at srcq - 3
- movu m5, [srcq + sstrideq - 3] ; row 1
- punpckhbw m1, m4, m4 ; double every byte so an odd palignr shift yields (p[i], p[i+1]) byte pairs
- punpcklbw m4, m4
- punpckhbw m3, m5, m5
- punpcklbw m5, m5
- palignr m0, m1, m4, 1 ; low qword: pairs starting at p0..p3; high qword: pairs starting at p4..p7
- pmaddubsw m0, k0k1k4k5 ; low qword = k0k1 terms, high qword = k4k5 terms
- palignr m1, m4, 5 ; low qword: pairs starting at p2..p5; high qword: pairs starting at p6..p9
- pmaddubsw m1, k2k3k6k7 ; low qword = k2k3 terms, high qword = k6k7 terms
- palignr m2, m3, m5, 1 ; same again for row 1
- pmaddubsw m2, k0k1k4k5
- palignr m3, m5, 5
- pmaddubsw m3, k2k3k6k7
- punpckhqdq m4, m0, m2 ; k4k5 terms of row 0 (low qword) and row 1 (high qword)
- punpcklqdq m0, m2 ; k0k1 terms of row 0 and row 1
- punpckhqdq m5, m1, m3 ; k6k7 terms of both rows
- punpcklqdq m1, m3 ; k2k3 terms of both rows
- paddsw m0, m4 ; x = k0k1 + k4k5
- paddsw m1, m5 ; y = k2k3 + k6k7
-%ifidn %1, h8_avg
- movd m4, [dstq] ; current dst pixels of both rows, averaged in below
- movd m5, [dstq + dstrideq]
-%endif
- paddsw m0, m1 ; z = signed SAT(x + y)
- paddsw m0, krd ; + 64 for rounding
- psraw m0, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- movu m5, [srcq + sstrideq]
- punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
- punpcklbw m4, m3 ; zero-extend to words, matching the row 0 / row 1 layout of m0
- paddsw m0, m4
-%endif
- packuswb m0, m0 ; clamp to unsigned bytes: row 0 in bytes 0-3, row 1 in bytes 4-7
- psrldq m1, m0, 4 ; bring row 1 down to the low dword
-
-%ifidn %1, h8_avg
- pavgb m0, m4
- pavgb m1, m5
-%endif
- movd [dstq], m0
- movd [dstq + dstrideq], m1
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
-
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done ; flags still come from "sub heightd, 2": non-zero here means the height was even
-
- movu m4, [srcq - 3]
- punpckhbw m1, m4, m4
- punpcklbw m4, m4
- palignr m0, m1, m4, 1
- palignr m1, m4, 5
- pmaddubsw m0, k0k1k4k5
- pmaddubsw m1, k2k3k6k7
- psrldq m2, m0, 8 ; single row: fold the high-qword terms (k4k5 / k6k7) onto the low qword
- psrldq m3, m1, 8
- paddsw m0, m2
- paddsw m1, m3
- paddsw m0, m1
- paddsw m0, krd
- psraw m0, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- punpcklbw m4, m3
- paddsw m0, m4
-%endif
- packuswb m0, m0
-%ifidn %1, h8_avg
- movd m4, [dstq]
- pavgb m0, m4
-%endif
- movd [dstq], m0
-.done:
- REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER8 1 ; %1 = variant (h8, h8_avg or h8_add_src): emits the 8-pixel-wide horizontal 8-tap filter, two rows per loop pass
-cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- dec heightd ; bias the count so the flags of the final "sub heightd, 2" tell odd heights from even ones
-
-.loop:
- ;Do two rows at once (NOTE(review): the body runs before any height check, so height is presumably >= 2 - confirm against callers)
- movu m0, [srcq - 3] ; row 0; p0 below denotes the byte at srcq - 3
- movu m4, [srcq + sstrideq - 3] ; row 1
- punpckhbw m1, m0, m0 ; double every byte so an odd palignr shift yields (p[i], p[i+1]) byte pairs
- punpcklbw m0, m0
- palignr m5, m1, m0, 13 ; pairs starting at p6..p13
- pmaddubsw m5, k6k7
- palignr m2, m1, m0, 5 ; pairs starting at p2..p9
- palignr m3, m1, m0, 9 ; pairs starting at p4..p11
- palignr m1, m0, 1 ; pairs starting at p0..p7
- pmaddubsw m1, k0k1
- punpckhbw m6, m4, m4 ; start on row 1 while the row 0 products are in flight
- punpcklbw m4, m4
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
-
- palignr m7, m6, m4, 13
- palignr m0, m6, m4, 5
- pmaddubsw m7, k6k7
- paddsw m1, m3 ; row 0: x = k0k1 + k4k5
- paddsw m2, m5 ; row 0: y = k2k3 + k6k7
- paddsw m1, m2 ; row 0: z = signed SAT(x + y)
-%ifidn %1, h8_avg
- movh m2, [dstq] ; current dst pixels: row 0 in the low qword, row 1 in the high qword
- movhps m2, [dstq + dstrideq]
-%endif
- palignr m5, m6, m4, 9
- palignr m6, m4, 1
- pmaddubsw m0, k2k3
- pmaddubsw m6, k0k1
- paddsw m1, krd ; row 0: + 64 for rounding
- pmaddubsw m5, k4k5
- psraw m1, 7
- paddsw m0, m7 ; row 1: y = k2k3 + k6k7
- paddsw m6, m5 ; row 1: x = k0k1 + k4k5
- paddsw m6, m0
- paddsw m6, krd
- psraw m6, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- movu m5, [srcq + sstrideq]
- punpcklbw m4, m3 ; zero-extend the first 8 source bytes of each row to words
- punpcklbw m5, m3
- paddsw m1, m4
- paddsw m6, m5
-%endif
- packuswb m1, m6 ; clamp to unsigned bytes: row 0 in the low qword, row 1 in the high qword
-%ifidn %1, h8_avg
- pavgb m1, m2
-%endif
- movh [dstq], m1
- movhps [dstq + dstrideq], m1
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done ; flags still come from "sub heightd, 2": non-zero here means the height was even
-
- movu m0, [srcq - 3]
- punpckhbw m3, m0, m0
- punpcklbw m0, m0
- palignr m1, m3, m0, 1
- palignr m2, m3, m0, 5
- palignr m4, m3, m0, 13
- palignr m3, m0, 9
- pmaddubsw m1, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
- pmaddubsw m4, k6k7
- paddsw m1, m3 ; x = k0k1 + k4k5
- paddsw m4, m2 ; y = k2k3 + k6k7
- paddsw m1, m4
- paddsw m1, krd
- psraw m1, 7
-%ifidn %1, h8_add_src
- pxor m6, m6
- movu m5, [srcq]
- punpcklbw m5, m6
- paddsw m1, m5
-%endif
- packuswb m1, m1
-%ifidn %1, h8_avg
- movh m0, [dstq]
- pavgb m1, m0
-%endif
- movh [dstq], m1
-.done:
- REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER16 1 ; %1 = variant (h8, h8_avg or h8_add_src): emits the 16-pixel-wide horizontal 8-tap filter, one row per loop pass
-cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-.loop:
- prefetcht0 [srcq + 2 * sstrideq -3]
-
- movu m0, [srcq - 3] ; m0-m3 accumulate the even-numbered output pixels (0, 2, ..., 14)
- movu m4, [srcq - 2] ; m4-m7 accumulate the odd-numbered output pixels (1, 3, ..., 15)
- pmaddubsw m0, k0k1
- pmaddubsw m4, k0k1
- movu m1, [srcq - 1]
- movu m5, [srcq + 0]
- pmaddubsw m1, k2k3
- pmaddubsw m5, k2k3
- movu m2, [srcq + 1]
- movu m6, [srcq + 2]
- pmaddubsw m2, k4k5
- pmaddubsw m6, k4k5
- movu m3, [srcq + 3]
- movu m7, [srcq + 4]
- pmaddubsw m3, k6k7
- pmaddubsw m7, k6k7
- paddsw m0, m2 ; x = k0k1 + k4k5
- paddsw m1, m3 ; y = k2k3 + k6k7
- paddsw m0, m1 ; z = signed SAT(x + y)
- paddsw m4, m6
- paddsw m5, m7
- paddsw m4, m5
- paddsw m0, krd ; + 64 for rounding
- paddsw m4, krd
- psraw m0, 7
- psraw m4, 7
-%ifidn %1, h8_add_src
-%if ARCH_X86=1 && CONFIG_PIC=1
- pcmpeqb m2, m2 ;all ones
- psrlw m2, 8 ;even_byte_mask
-%else
- mova m2, [GLOBAL(even_byte_mask)]
-%endif
- movu m5, [srcq]
- mova m7, m5
- pand m5, m2 ; even-indexed source bytes as words, matching m0
- psrlw m7, 8 ; odd-indexed source bytes as words, matching m4
- paddsw m0, m5
- paddsw m4, m7
-%endif
- packuswb m0, m0 ; clamp both halves to unsigned bytes
- packuswb m4, m4
- punpcklbw m0, m4 ; interleave even and odd results back into pixel order
-%ifidn %1, h8_avg
- pavgb m0, [dstq]
-%endif
- lea srcq, [srcq + sstrideq]
- mova [dstq], m0 ; aligned 16-byte store
- lea dstq, [dstq + dstrideq]
- dec heightd
- jnz .loop
- REP_RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_HFILTER16 h8 ; instantiates cglobal filter_block1d16_h8 (only the plain h8 variant is generated here)
-SUBPIX_HFILTER8 h8 ; instantiates cglobal filter_block1d8_h8
-SUBPIX_HFILTER4 h8 ; instantiates cglobal filter_block1d4_h8
-
-;-------------------------------------------------------------------------------
-
-; TODO(Linfeng): Detect cpu type and choose the code with better performance.
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
-
-%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
- %define NUM_GENERAL_REG_USED 9
-%else
- %define NUM_GENERAL_REG_USED 6
-%endif ; 9 makes r7 and r8 available, which the vertical filters use as src1q / sstride6q on x86-64
-
-%macro SUBPIX_VFILTER 2 ; %1 = variant (v8, v8_avg or v8_add_src), %2 = width (8 or 4): emits the narrow vertical 8-tap filter, two rows per loop pass
-cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-%ifidn %2, 8
- %define movx movh
-%else
- %define movx movd
-%endif
-
- dec heightd ; bias the count so the flags of the final "sub heightd, 2" tell odd heights from even ones
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- mov src1q, srcq ; on x86-32 this reuses filterq, which is free once the taps are in m4
- add src1q, sstrideq ; src1q = src + 1 row
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-.loop:
- ;Do two rows at once (NOTE(review): the body runs before any height check, so height is presumably >= 2 - confirm against callers)
- movx m0, [srcq ] ;A
- movx m1, [src1q ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2 ; keep a copy of C for the second output row
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4 ; keep a copy of E for the second output row
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;A B next iter (rows B and C)
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;E F next iter (rows F and G)
- punpcklbw m3, m7 ;C D next iter (rows D and E)
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- paddsw m0, m4 ; first row: x = k0k1 + k4k5
- paddsw m2, m6 ; first row: y = k2k3 + k6k7
- movx m6, [srcq + sstrideq * 8 ] ;H next iter (the row below H)
- punpcklbw m7, m6
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m5 ; second row: x = k0k1 + k4k5
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [srcq] ; NOTE(review): adds row A (top of the tap window), while SUBPIX_VFILTER16 adds the row 3 lines down - confirm which is intended
- punpcklbw m4, m6
- paddsw m0, m4
-%endif
- packuswb m0, m0
-
- paddsw m3, m7 ; second row: y = k2k3 + k6k7
- paddsw m1, m3
- paddsw m1, krd
- psraw m1, 7
-%ifidn %1, v8_add_src
- movu m4, [src1q]
- punpcklbw m4, m6
- paddsw m1, m4
-%endif
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
-
-%ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
-%endif
- movx [dstq], m0
- add dstq, dst_stride ; on x86-32 the stride is read from the stack argument because dstrideq holds pitch * 6
-%ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
-%endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done ; flags still come from "sub heightd, 2": non-zero here means the height was even
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [src1q + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [src1q + sstrideq * 2] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m2, m6
- paddsw m0, m4
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [srcq]
- punpcklbw m4, m6
- paddsw m0, m4
-%endif
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-
-%else
- ; ARCH_X86_64 path that keeps the row pairs in registers between passes; not assembled while X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON is 1, and it has no v8_add_src handling
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2 ]
- movx m2, [srcq] ;C
- movx m3, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2 ]
- movx m4, [srcq] ;E
- movx m5, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2 ]
- movx m6, [srcq] ;G
- punpcklbw m0, m1 ;A B
- punpcklbw m1, m2 ;A B next iter
- punpcklbw m2, m3 ;C D
- punpcklbw m3, m4 ;C D next iter
- punpcklbw m4, m5 ;E F
- punpcklbw m5, m6 ;E F next iter
-
-.loop:
- ;Do two rows at once
- movx m7, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2 ]
- movx m14, [srcq] ;H next iter
- punpcklbw m6, m7 ;G H
- punpcklbw m7, m14 ;G H next iter
- pmaddubsw m8, m0, k0k1
- pmaddubsw m9, m1, k0k1
- mova m0, m2 ; slide the window: this pass's C D pairs become the next pass's A B pairs
- mova m1, m3
- pmaddubsw m10, m2, k2k3
- pmaddubsw m11, m3, k2k3
- mova m2, m4 ; E F pairs become the next C D pairs
- mova m3, m5
- pmaddubsw m4, k4k5
- pmaddubsw m5, k4k5
- paddsw m8, m4
- paddsw m9, m5
- mova m4, m6 ; G H pairs become the next E F pairs
- mova m5, m7
- pmaddubsw m6, k6k7
- pmaddubsw m7, k6k7
- paddsw m10, m6
- paddsw m11, m7
- paddsw m8, m10
- paddsw m9, m11
- mova m6, m14 ; the last row loaded is the G of the next pass
- paddsw m8, krd
- paddsw m9, krd
- psraw m8, 7
- psraw m9, 7
-%ifidn %2, 4
- packuswb m8, m8
- packuswb m9, m9
-%else
- packuswb m8, m9
-%endif
-
-%ifidn %1, v8_avg
- movx m7, [dstq]
-%ifidn %2, 4
- movx m10, [dstq + dstrideq]
- pavgb m9, m10
-%else
- movhpd m7, [dstq + dstrideq]
-%endif
- pavgb m8, m7
-%endif
- movx [dstq], m8
-%ifidn %2, 4
- movx [dstq + dstrideq], m9
-%else
- movhpd [dstq + dstrideq], m8
-%endif
-
- lea dstq, [dstq + dstrideq * 2 ]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movx m7, [srcq + sstrideq] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- pmaddubsw m6, k6k7
- paddsw m0, m4
- paddsw m2, m6
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-
-%endif ; ARCH_X86_64
-
-.done:
- REP_RET
-
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER16 1 ; %1 = variant (v8, v8_avg or v8_add_src): emits the 16-pixel-wide vertical 8-tap filter
-cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- lea src1q, [srcq + sstrideq] ; src1q = src + 1 row (on x86-32 this reuses filterq, free once the taps are in m4)
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-.loop:
- movh m0, [srcq ] ;A (one output row per pass; columns 0-7 first)
- movh m1, [src1q ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
-
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A (columns 8-15 from here on, interleaved with the first half)
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- punpcklbw m7, m5 ;C D
- paddsw m2, m6 ; columns 0-7: y = k2k3 + k6k7
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4 ; columns 0-7: x = k0k1 + k4k5
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- pmaddubsw m2, k6k7
- paddsw m7, m2 ; columns 8-15: y = k2k3 + k6k7
- paddsw m3, m1 ; columns 8-15: x = k0k1 + k4k5
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
- mova m5, m4
- punpcklbw m4, m6 ; columns 0-7 as words
- punpckhbw m5, m6 ; columns 8-15 as words
- paddsw m0, m4
- paddsw m3, m5
-%endif
- packuswb m0, m3 ; clamp to unsigned bytes: columns 0-7 low, 8-15 high
-
- add srcq, sstrideq
- add src1q, sstrideq
-%ifidn %1, v8_avg
- pavgb m0, [dstq]
-%endif
- mova [dstq], m0 ; aligned 16-byte store
- add dstq, dst_stride ; on x86-32 the stride is read from the stack argument because dstrideq holds pitch * 6
- dec heightd
- jnz .loop
- REP_RET
-
-%else
- ; ARCH_X86_64 path that keeps the row pairs in registers (plus tmp0/tmp1) between passes; not assembled while X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON is 1, and it has no v8_add_src handling
- dec heightd ; bias the count so the flags of the final "sub heightd, 2" tell odd heights from even ones
-
- movu m1, [srcq ] ;A
- movu m3, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m0, m1, m3 ;A B
- punpckhbw m1, m3 ;A B
- movu m5, [srcq] ;C
- punpcklbw m2, m3, m5 ;A B next iter
- punpckhbw m3, m5 ;A B next iter
- mova tmp0, m2 ;store to stack
- mova tmp1, m3 ;store to stack
- movu m7, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m4, m5, m7 ;C D
- punpckhbw m5, m7 ;C D
- movu m9, [srcq] ;E
- punpcklbw m6, m7, m9 ;C D next iter
- punpckhbw m7, m9 ;C D next iter
- movu m11, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m8, m9, m11 ;E F
- punpckhbw m9, m11 ;E F
- movu m2, [srcq] ;G
- punpcklbw m10, m11, m2 ;E F next iter
- punpckhbw m11, m2 ;E F next iter
-
-.loop:
- ;Do two rows at once
- pmaddubsw m13, m0, k0k1 ; first row, columns 0-7
- mova m0, m4
- pmaddubsw m14, m8, k4k5
- pmaddubsw m15, m4, k2k3
- mova m4, m8
- paddsw m13, m14
- movu m3, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m14, m2, m3 ;G H
- mova m8, m14
- pmaddubsw m14, k6k7
- paddsw m15, m14
- paddsw m13, m15
- paddsw m13, krd
- psraw m13, 7
-
- pmaddubsw m14, m1, k0k1 ; first row, columns 8-15
- pmaddubsw m1, m9, k4k5
- pmaddubsw m15, m5, k2k3
- paddsw m14, m1
- mova m1, m5
- mova m5, m9
- punpckhbw m2, m3 ;G H
- mova m9, m2
- pmaddubsw m2, k6k7
- paddsw m15, m2
- paddsw m14, m15
- paddsw m14, krd
- psraw m14, 7
- packuswb m13, m14
-%ifidn %1, v8_avg
- pavgb m13, [dstq]
-%endif
- mova [dstq], m13
-
- ; next iter (second output row of this pass)
- pmaddubsw m15, tmp0, k0k1
- pmaddubsw m14, m10, k4k5
- pmaddubsw m13, m6, k2k3
- paddsw m15, m14
- mova tmp0, m6
- mova m6, m10
- movu m2, [srcq] ;G next iter
- punpcklbw m14, m3, m2 ;G H next iter
- mova m10, m14
- pmaddubsw m14, k6k7
- paddsw m13, m14
- paddsw m15, m13
- paddsw m15, krd
- psraw m15, 7
-
- pmaddubsw m14, tmp1, k0k1
- mova tmp1, m7
- pmaddubsw m13, m7, k2k3
- mova m7, m11
- pmaddubsw m11, k4k5
- paddsw m14, m11
- punpckhbw m3, m2 ;G H next iter
- mova m11, m3
- pmaddubsw m3, k6k7
- paddsw m13, m3
- paddsw m14, m13
- paddsw m14, krd
- psraw m14, 7
- packuswb m15, m14
-%ifidn %1, v8_avg
- pavgb m15, [dstq + dstrideq]
-%endif
- mova [dstq + dstrideq], m15
- lea dstq, [dstq + dstrideq * 2]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movu m3, [srcq + sstrideq] ;H
- punpcklbw m6, m2, m3 ;G H
- punpckhbw m2, m3 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m1, k0k1
- pmaddubsw m4, k2k3
- pmaddubsw m5, k2k3
- pmaddubsw m8, k4k5
- pmaddubsw m9, k4k5
- pmaddubsw m6, k6k7
- pmaddubsw m2, k6k7
- paddsw m0, m8
- paddsw m1, m9
- paddsw m4, m6
- paddsw m5, m2
- paddsw m0, m4
- paddsw m1, m5
- paddsw m0, krd
- paddsw m1, krd
- psraw m0, 7
- psraw m1, 7
- packuswb m0, m1
-%ifidn %1, v8_avg
- pavgb m0, [dstq]
-%endif
- mova [dstq], m0
-
-.done:
- REP_RET
-
-%endif ; ARCH_X86_64
-
-%endm
-
-INIT_XMM ssse3
-SUBPIX_VFILTER16 v8 ; instantiates cglobal filter_block1d16_v8 (only the plain v8 variant is generated here)
-SUBPIX_VFILTER v8, 8 ; instantiates cglobal filter_block1d8_v8
-SUBPIX_VFILTER v8, 4 ; instantiates cglobal filter_block1d4_v8