summaryrefslogtreecommitdiffstats
path: root/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
diff options
context:
space:
mode:
Diffstat (limited to 'media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm')
-rw-r--r--media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm257
1 files changed, 257 insertions, 0 deletions
diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
index c0ac16d3e..32b698243 100644
--- a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -847,6 +847,108 @@ DL_FUNCS
INIT_XMM avx
DL_FUNCS
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
+ mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
+ vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
+ vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
+ vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
+ vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
+ LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
+ vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
+ vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
+ vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
+ LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
+ vperm2i128 m2, m1, m4, q0201 ; Z......555555555
+ vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq*0 + 0], m0
+ mova [dstq+strideq*0 +32], m1
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m2, m1, 2
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m2, m1, 4
+ mova [dstq+strideq*2 + 0], m3
+ mova [dstq+strideq*2 +32], m4
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m2, m1, 6
+ mova [dstq+stride3q*1+ 0], m3
+ mova [dstq+stride3q*1+32], m4
+ lea dstq, [dstq+strideq*4]
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m2, m1, 8
+ mova [dstq+strideq*0 + 0], m3
+ mova [dstq+strideq*0 +32], m4
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m2, m1, 10
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m2, m1, 12
+ mova [dstq+strideq*2+ 0], m3
+ mova [dstq+strideq*2+32], m4
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m2, m1, 14
+ mova [dstq+stride3q+ 0], m3
+ mova [dstq+stride3q+ 32], m4
+ vpalignr m3, m5, m0, 16
+ vpalignr m4, m2, m1, 16
+ vperm2i128 m5, m3, m4, q0201
+ vperm2i128 m2, m4, m4, q0101
+ mova m0, m3
+ mova m1, m4
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
movh m0, [lq] ; wxyz....
@@ -1068,6 +1170,161 @@ DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+ mova m0, [lq] ; klmnopqrstuvwxyz
+ movu m1, [aq-2] ; *abcdefghijklmno
+ mova m2, [aq] ; abcdefghijklmnop
+ vperm2i128 m4, m2, m2, q2001 ; ijklmnop........
+ vpalignr m5, m4, m2, 2 ; bcdefghijklmnop.
+ vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg
+ LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO.
+ vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz*
+ vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ#
+ vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH
+ DEFINE_ARGS dst, stride, stride3, stride5, dst3
+ lea dst3q, [dstq+strideq*4]
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m1, m5, 2
+ mova [dst3q+stride5q*2], m3 ; 14
+ mova [ dstq+stride3q*2], m4 ; 6
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m1, m5, 4
+ sub dst3q, strideq
+ mova [dst3q+stride5q*2], m3 ; 13
+ mova [dst3q+strideq*2 ], m4 ; 5
+ mova [dst3q+stride3q*4], m0 ; 15
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m1, m5, 6
+ mova [dstq+stride3q*4], m3 ; 12
+ mova [dst3q+strideq*1], m4 ; 4
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m1, m5, 8
+ mova [dst3q+strideq*8], m3 ; 11
+ mova [dst3q+strideq*0], m4 ; 3
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m1, m5, 10
+ mova [dstq+stride5q*2], m3 ; 10
+ mova [dstq+strideq*2 ], m4 ; 2
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m1, m5, 12
+ mova [dst3q+stride3q*2], m3 ; 9
+ mova [dstq+strideq*1 ], m4 ; 1
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m1, m5, 14
+ mova [dstq+strideq*8], m3 ; 8
+ mova [dstq+strideq*0], m4 ; 0
+ mova [dst3q+strideq*4], m5 ; 7
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
+ mova m0, [lq+mmsize*0+0] ; l[0-15]
+ mova m1, [lq+mmsize*1+0] ; l[16-31]
+ movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno
+ mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop
+ mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345
+ vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0
+ vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01
+ vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012
+ LOWPASS 0, 6, 7 ; L[0-15]
+ vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg
+ vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz*
+ vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 1, 5, 6 ; L[16-31]#
+ vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx
+ vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq
+ LOWPASS 2, 3, 6 ; A[0-15]
+ movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234
+ vperm2i128 m6, m4, m4, q2001 ; yz012345........
+ vpalignr m7, m6, m4, 2 ; rstuvwxyz012345.
+ LOWPASS 3, 4, 7 ; A[16-31].
+ vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH
+ vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23]
+ vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX
+ DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+ lea stride7q, [strideq*4+stride3q]
+ lea dst24q, [dst8q+stride3q*8]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 2
+
+.loop:
+ mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7
+ mova [dst24q+stride7q+32], m1
+ mova [dst8q+stride7q+0], m1
+ mova [dst8q+stride7q+32], m2
+ vpalignr m6, m4, m1, 2
+ vpalignr m7, m5, m0, 2
+ vpalignr m9, m8, m2, 2
+ mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
+ mova [dst24q+stride3q*2+32], m6
+ mova [dst8q+stride3q*2+0], m6
+ mova [dst8q+stride3q*2+32], m9
+ vpalignr m6, m4, m1, 4
+ vpalignr m7, m5, m0, 4
+ vpalignr m9, m8, m2, 4
+ mova [dst24q+stride5q+0], m7 ; 29 21 13 5
+ mova [dst24q+stride5q+32], m6
+ mova [dst8q+stride5q+0], m6
+ mova [dst8q+stride5q+32], m9
+ vpalignr m6, m4, m1, 6
+ vpalignr m7, m5, m0, 6
+ vpalignr m9, m8, m2, 6
+ mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4
+ mova [dst24q+strideq*4+32], m6
+ mova [dst8q+strideq*4+0], m6
+ mova [dst8q+strideq*4+32], m9
+ vpalignr m6, m4, m1, 8
+ vpalignr m7, m5, m0, 8
+ vpalignr m9, m8, m2, 8
+ mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3
+ mova [dst24q+stride3q+32], m6
+ mova [dst8q+stride3q+0], m6
+ mova [dst8q+stride3q+32], m9
+ vpalignr m6, m4, m1, 10
+ vpalignr m7, m5, m0, 10
+ vpalignr m9, m8, m2, 10
+ mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2
+ mova [dst24q+strideq*2+32], m6
+ mova [dst8q+strideq*2+0], m6
+ mova [dst8q+strideq*2+32], m9
+ vpalignr m6, m4, m1, 12
+ vpalignr m7, m5, m0, 12
+ vpalignr m9, m8, m2, 12
+ mova [dst24q+strideq+0 ], m7 ; 25 17 9 1
+ mova [dst24q+strideq+32], m6
+ mova [dst8q+strideq+0], m6
+ mova [dst8q+strideq+32], m9
+ vpalignr m6, m4, m1, 14
+ vpalignr m7, m5, m0, 14
+ vpalignr m9, m8, m2, 14
+ mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0
+ mova [dst24q+strideq*0+32], m6
+ mova [dst8q+strideq*0+0], m6
+ mova [dst8q+strideq*0+32], m9
+ mova m0, m5
+ mova m5, m1
+ mova m1, m4
+ mova m4, m2
+ mova m2, m8
+ mova m8, m3
+ sub dst24q, stride7q
+ sub dst24q, strideq
+ sub dst8q, stride7q
+ sub dst8q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endif
+
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp