author     trav90 <travawine@protonmail.ch>  2018-04-26 16:49:15 -0500
committer  trav90 <travawine@protonmail.ch>  2018-04-26 16:49:15 -0500
commit     8b37a1bc306c1d5a3cc92e9dc04fb95d5d9a0298 (patch)
tree       c37da5241afd7cda6f92d394e14cef7d5dcbb4e5 /media/ffvpx/libavutil/x86
parent     56a2df6b25bc93ea9a59b8e0bf8029f752f68573 (diff)
[ffvpx] Update ffvp9/ffvp8 to 3.4.2-release
The code structure was slightly modified so that it should no longer be necessary to re-generate the config_*.h files, greatly simplifying future resyncs.
Diffstat (limited to 'media/ffvpx/libavutil/x86')
-rw-r--r--   media/ffvpx/libavutil/x86/cpu.c                37
-rw-r--r--   media/ffvpx/libavutil/x86/cpu.h                 6
-rw-r--r--   media/ffvpx/libavutil/x86/emms.asm              4
-rw-r--r--   media/ffvpx/libavutil/x86/emms.h                4
-rw-r--r--   media/ffvpx/libavutil/x86/float_dsp.asm        86
-rw-r--r--   media/ffvpx/libavutil/x86/float_dsp_init.c     17
-rw-r--r--   media/ffvpx/libavutil/x86/imgutils.asm         53
-rw-r--r--   media/ffvpx/libavutil/x86/imgutils_init.c      49
-rw-r--r--   media/ffvpx/libavutil/x86/moz.build             2
-rw-r--r--   media/ffvpx/libavutil/x86/x86inc.asm           94
-rw-r--r--   media/ffvpx/libavutil/x86/x86util.asm         161
11 files changed, 449 insertions, 64 deletions
diff --git a/media/ffvpx/libavutil/x86/cpu.c b/media/ffvpx/libavutil/x86/cpu.c
index f3a49c677..f33088c8c 100644
--- a/media/ffvpx/libavutil/x86/cpu.c
+++ b/media/ffvpx/libavutil/x86/cpu.c
@@ -28,7 +28,7 @@
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
#define cpuid(index, eax, ebx, ecx, edx) \
ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)
@@ -66,7 +66,7 @@
#define cpuid_test() 1
-#elif HAVE_YASM
+#elif HAVE_X86ASM
#define cpuid_test ff_cpu_cpuid_test
@@ -221,9 +221,42 @@ int ff_get_cpu_flags_x86(void)
* functions on the Atom. */
if (family == 6 && model == 28)
rval |= AV_CPU_FLAG_ATOM;
+
+ /* Conroe has a slow shuffle unit. Check the model number to ensure not
+ * to include crippled low-end Penryns and Nehalems that lack SSE4. */
+ if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) &&
+ family == 6 && model < 23)
+ rval |= AV_CPU_FLAG_SSSE3SLOW;
}
#endif /* cpuid */
return rval;
}
+
+size_t ff_get_cpu_max_align_x86(void)
+{
+ int flags = av_get_cpu_flags();
+
+ if (flags & (AV_CPU_FLAG_AVX2 |
+ AV_CPU_FLAG_AVX |
+ AV_CPU_FLAG_XOP |
+ AV_CPU_FLAG_FMA4 |
+ AV_CPU_FLAG_FMA3 |
+ AV_CPU_FLAG_AVXSLOW))
+ return 32;
+ if (flags & (AV_CPU_FLAG_AESNI |
+ AV_CPU_FLAG_SSE42 |
+ AV_CPU_FLAG_SSE4 |
+ AV_CPU_FLAG_SSSE3 |
+ AV_CPU_FLAG_SSE3 |
+ AV_CPU_FLAG_SSE2 |
+ AV_CPU_FLAG_SSE |
+ AV_CPU_FLAG_ATOM |
+ AV_CPU_FLAG_SSSE3SLOW |
+ AV_CPU_FLAG_SSE3SLOW |
+ AV_CPU_FLAG_SSE2SLOW))
+ return 16;
+
+ return 8;
+}
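[editor's note] ff_get_cpu_max_align_x86() added above is the x86 backend for libavutil's public av_cpu_max_align() helper from the 3.4 series: it reports the widest SIMD alignment (8, 16 or 32 bytes) implied by the detected CPU flags, so buffer padding can be sized once at runtime. A minimal caller sketch, assuming only public libavutil API; the function name and padding scheme below are illustrative, not part of this patch:

    #include <stdint.h>
    #include "libavutil/common.h"   /* FFALIGN */
    #include "libavutil/cpu.h"      /* av_cpu_max_align */
    #include "libavutil/mem.h"      /* av_malloc */

    /* Pad a row so SIMD loops can read/write it in whole vectors.
     * On x86, av_cpu_max_align() resolves to ff_get_cpu_max_align_x86(). */
    static uint8_t *alloc_padded_row(size_t width)
    {
        size_t align = av_cpu_max_align();        /* 8, 16 or 32 on x86 */
        return av_malloc(FFALIGN(width, align));  /* av_malloc's own alignment is typically >= this */
    }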
diff --git a/media/ffvpx/libavutil/x86/cpu.h b/media/ffvpx/libavutil/x86/cpu.h
index f171037f1..309b8e746 100644
--- a/media/ffvpx/libavutil/x86/cpu.h
+++ b/media/ffvpx/libavutil/x86/cpu.h
@@ -38,6 +38,8 @@
#define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3)
#define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3)
#define X86_SSSE3(flags) CPUEXT(flags, SSSE3)
+#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3)
+#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3)
#define X86_SSE4(flags) CPUEXT(flags, SSE4)
#define X86_SSE42(flags) CPUEXT(flags, SSE42)
#define X86_AVX(flags) CPUEXT(flags, AVX)
@@ -61,6 +63,8 @@
#define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3)
+#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3)
+#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4)
#define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42)
#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX)
@@ -88,6 +92,8 @@
#define INLINE_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3)
#define INLINE_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3)
#define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3)
+#define INLINE_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3)
+#define INLINE_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3)
#define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4)
#define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42)
#define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX)
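[editor's note] The new SSSE3_FAST/SSSE3_SLOW macro variants combine the SSSE3 bit with the AV_CPU_FLAG_SSSE3SLOW bit that cpu.c now sets for Conroe, so dispatch code can keep SSSE3 kernels off CPUs whose slow shuffle unit makes them a loss. A hedged sketch of such dispatch; the context type and kernels are made-up placeholders, not part of this patch:

    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"

    /* Illustrative context and kernels -- stand-ins for real DSP entry points. */
    typedef struct ExampleDSPContext { void (*do_thing)(float *dst, int len); } ExampleDSPContext;
    static void do_thing_c(float *dst, int len)     { (void)dst; (void)len; }
    static void do_thing_sse2(float *dst, int len)  { (void)dst; (void)len; }
    static void do_thing_ssse3(float *dst, int len) { (void)dst; (void)len; }

    static void example_dsp_init_x86(ExampleDSPContext *c)
    {
        int cpu_flags = av_get_cpu_flags();

        c->do_thing = do_thing_c;                 /* plain C fallback */
        if (EXTERNAL_SSE2(cpu_flags))
            c->do_thing = do_thing_sse2;          /* safe baseline */
        if (EXTERNAL_SSSE3_FAST(cpu_flags))       /* SSSE3 present and not flagged SSSE3SLOW */
            c->do_thing = do_thing_ssse3;
    }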
diff --git a/media/ffvpx/libavutil/x86/emms.asm b/media/ffvpx/libavutil/x86/emms.asm
index 0aad34af3..8611762d7 100644
--- a/media/ffvpx/libavutil/x86/emms.asm
+++ b/media/ffvpx/libavutil/x86/emms.asm
@@ -23,8 +23,8 @@
SECTION .text
;-----------------------------------------------------------------------------
-; void avpriv_emms_yasm(void)
+; void avpriv_emms_asm(void)
;-----------------------------------------------------------------------------
-cvisible emms_yasm, 0, 0
+cvisible emms_asm, 0, 0
emms
RET
diff --git a/media/ffvpx/libavutil/x86/emms.h b/media/ffvpx/libavutil/x86/emms.h
index 42c18e295..c21e34b45 100644
--- a/media/ffvpx/libavutil/x86/emms.h
+++ b/media/ffvpx/libavutil/x86/emms.h
@@ -23,7 +23,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-void avpriv_emms_yasm(void);
+void avpriv_emms_asm(void);
#if HAVE_MMX_INLINE
# define emms_c emms_c
@@ -49,7 +49,7 @@ static av_always_inline void emms_c(void)
# include <mmintrin.h>
# define emms_c _mm_empty
#elif HAVE_MMX_EXTERNAL
-# define emms_c avpriv_emms_yasm
+# define emms_c avpriv_emms_asm
#endif /* HAVE_MMX_INLINE */
#endif /* AVUTIL_X86_EMMS_H */
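[editor's note] The yasm-to-asm rename is mechanical (the build no longer assumes yasm, hence HAVE_X86ASM), but the surrounding context shows what emms_c() is for: MMX code aliases the x87 register file, and emms / _mm_empty() / avpriv_emms_asm() must clear that state before ordinary floating-point code runs again. A hedged calling sketch; the kernel name is invented for illustration:

    #include <stdint.h>
    #include "libavutil/x86/emms.h"

    /* Illustrative prototype standing in for any MMX-accelerated ff_* kernel. */
    void ff_example_add_bytes_mmx(uint8_t *dst, const uint8_t *src, int len);

    static float finish_row(uint8_t *dst, const uint8_t *src, int len)
    {
        ff_example_add_bytes_mmx(dst, src, len); /* leaves MMX/x87 state dirty */
        emms_c();                                /* emms, _mm_empty() or avpriv_emms_asm() */
        return dst[0] / 255.0f;                  /* safe to do FP math again */
    }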
diff --git a/media/ffvpx/libavutil/x86/float_dsp.asm b/media/ffvpx/libavutil/x86/float_dsp.asm
index 021ff03c8..06d2d2cfd 100644
--- a/media/ffvpx/libavutil/x86/float_dsp.asm
+++ b/media/ffvpx/libavutil/x86/float_dsp.asm
@@ -22,6 +22,9 @@
%include "x86util.asm"
+SECTION_RODATA 32
+pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -149,6 +152,69 @@ INIT_XMM sse
VECTOR_FMUL_SCALAR
;------------------------------------------------------------------------------
+; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
+; int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_DMAC_SCALAR 0
+%if ARCH_X86_32
+cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
+ mov lenq, lenaddrm
+ VBROADCASTSD m0, mulm
+%else
+%if UNIX64
+cglobal vector_dmac_scalar, 3,3,5, dst, src, len
+%else
+cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
+ SWAP 0, 2
+%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xm0, 1
+%endif
+%endif
+ lea lenq, [lend*8-mmsize*4]
+.loop:
+%if cpuflag(fma3)
+ movaps m1, [dstq+lenq]
+ movaps m2, [dstq+lenq+1*mmsize]
+ movaps m3, [dstq+lenq+2*mmsize]
+ movaps m4, [dstq+lenq+3*mmsize]
+ fmaddpd m1, m0, [srcq+lenq], m1
+ fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
+ fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
+ fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
+%else ; cpuflag
+ mulpd m1, m0, [srcq+lenq]
+ mulpd m2, m0, [srcq+lenq+1*mmsize]
+ mulpd m3, m0, [srcq+lenq+2*mmsize]
+ mulpd m4, m0, [srcq+lenq+3*mmsize]
+ addpd m1, m1, [dstq+lenq]
+ addpd m2, m2, [dstq+lenq+1*mmsize]
+ addpd m3, m3, [dstq+lenq+2*mmsize]
+ addpd m4, m4, [dstq+lenq+3*mmsize]
+%endif ; cpuflag
+ movaps [dstq+lenq], m1
+ movaps [dstq+lenq+1*mmsize], m2
+ movaps [dstq+lenq+2*mmsize], m3
+ movaps [dstq+lenq+3*mmsize], m4
+ sub lenq, mmsize*4
+ jge .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_DMAC_SCALAR
+%endif
+
+;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
@@ -177,8 +243,8 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
.loop:
mulpd m1, m0, [srcq+lenq ]
mulpd m2, m0, [srcq+lenq+mmsize]
- mova [dstq+lenq ], m1
- mova [dstq+lenq+mmsize], m2
+ movaps [dstq+lenq ], m1
+ movaps [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
@@ -296,10 +362,16 @@ VECTOR_FMUL_ADD
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
+%if cpuflag(avx2)
+ movaps m2, [pd_reverse]
+%endif
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
-%if cpuflag(avx)
+%if cpuflag(avx2)
+ vpermps m0, m2, [src1q]
+ vpermps m1, m2, [src1q+mmsize]
+%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
@@ -314,8 +386,8 @@ ALIGN 16
%endif
mulps m0, m0, [src0q + lenq + mmsize]
mulps m1, m1, [src0q + lenq]
- mova [dstq + lenq + mmsize], m0
- mova [dstq + lenq], m1
+ movaps [dstq + lenq + mmsize], m0
+ movaps [dstq + lenq], m1
add src1q, 2*mmsize
sub lenq, 2*mmsize
jge .loop
@@ -328,6 +400,10 @@ VECTOR_FMUL_REVERSE
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VECTOR_FMUL_REVERSE
+%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
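[editor's note] For reference, the scalar semantics of the kernels this file adds or extends: vector_dmac_scalar() multiply-accumulates a constant into a double vector, and vector_fmul_reverse() multiplies one float vector by another read backwards (the new AVX2 path does the reversal with a single vpermps through pd_reverse). A plain-C sketch matching the generic fallbacks; len is assumed to satisfy the alignment/multiple constraints the SIMD versions expect:

    /* dst[i] += src[i] * mul -- what ff_vector_dmac_scalar_{sse2,avx,fma3} compute */
    static void vector_dmac_scalar_c(double *dst, const double *src, double mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] += src[i] * mul;
    }

    /* dst[i] = src0[i] * src1[len - 1 - i] -- what ff_vector_fmul_reverse_* compute */
    static void vector_fmul_reverse_c(float *dst, const float *src0,
                                      const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }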
diff --git a/media/ffvpx/libavutil/x86/float_dsp_init.c b/media/ffvpx/libavutil/x86/float_dsp_init.c
index c836a78e1..122087a19 100644
--- a/media/ffvpx/libavutil/x86/float_dsp_init.c
+++ b/media/ffvpx/libavutil/x86/float_dsp_init.c
@@ -39,6 +39,13 @@ void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
+void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul,
+ int len);
+void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul,
+ int len);
+void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul,
+ int len);
+
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
@@ -60,10 +67,12 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len);
+void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
+ const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
-void ff_butterflies_float_sse(float *src0, float *src1, int len);
+void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
@@ -83,17 +92,23 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->butterflies_float = ff_butterflies_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
+ }
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
}
}
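[editor's note] These assignments are only reached through the AVFloatDSPContext function pointers, so callers never name the SIMD variants directly. A hedged usage sketch (buffer setup elided; the SIMD versions generally want suitably aligned buffers and a length that is a multiple of the unroll factor):

    #include "libavutil/float_dsp.h"
    #include "libavutil/mem.h"

    static void scale_and_accumulate(double *dst, const double *src, double gain, int len)
    {
        AVFloatDSPContext *fdsp = avpriv_float_dsp_alloc(0); /* 0 = bit-exactness not required */
        if (!fdsp)
            return;
        /* Dispatches to ff_vector_dmac_scalar_{sse2,avx,fma3} on capable x86 CPUs. */
        fdsp->vector_dmac_scalar(dst, src, gain, len);
        av_free(fdsp);
    }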
diff --git a/media/ffvpx/libavutil/x86/imgutils.asm b/media/ffvpx/libavutil/x86/imgutils.asm
new file mode 100644
index 000000000..3cca56cdc
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/imgutils.asm
@@ -0,0 +1,53 @@
+;*****************************************************************************
+;* Copyright 2016 Anton Khirnov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos
+ add dstq, bwq
+ add srcq, bwq
+ neg bwq
+
+.row_start:
+ mov rowposq, bwq
+
+.loop:
+ movntdqa m0, [srcq + rowposq + 0 * mmsize]
+ movntdqa m1, [srcq + rowposq + 1 * mmsize]
+ movntdqa m2, [srcq + rowposq + 2 * mmsize]
+ movntdqa m3, [srcq + rowposq + 3 * mmsize]
+
+ mova [dstq + rowposq + 0 * mmsize], m0
+ mova [dstq + rowposq + 1 * mmsize], m1
+ mova [dstq + rowposq + 2 * mmsize], m2
+ mova [dstq + rowposq + 3 * mmsize], m3
+
+ add rowposq, 4 * mmsize
+ jnz .loop
+
+ add srcq, src_linesizeq
+ add dstq, dst_linesizeq
+ dec heightd
+ jnz .row_start
+
+ RET
diff --git a/media/ffvpx/libavutil/x86/imgutils_init.c b/media/ffvpx/libavutil/x86/imgutils_init.c
new file mode 100644
index 000000000..4ea398205
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/imgutils_init.c
@@ -0,0 +1,49 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/cpu.h"
+#include "libavutil/error.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/imgutils_internal.h"
+#include "libavutil/internal.h"
+
+#include "cpu.h"
+
+void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize,
+ const uint8_t *src, ptrdiff_t src_linesize,
+ ptrdiff_t bytewidth, int height);
+
+int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize,
+ const uint8_t *src, ptrdiff_t src_linesize,
+ ptrdiff_t bytewidth, int height)
+{
+ int cpu_flags = av_get_cpu_flags();
+ ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64);
+
+ if (EXTERNAL_SSE4(cpu_flags) &&
+ bw_aligned <= dst_linesize && bw_aligned <= src_linesize)
+ ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize,
+ bw_aligned, height);
+ else
+ return AVERROR(ENOSYS);
+
+ return 0;
+}
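[editor's note] ff_image_copy_plane_uc_from_x86() only claims the copy when SSE4.1 is present and both linesizes can hold the 64-byte-aligned width the movntdqa loop reads; otherwise it returns AVERROR(ENOSYS) and the generic code is expected to fall back to an ordinary per-row copy. A hedged sketch of that calling pattern; the wrapper name is illustrative (in FFmpeg the real caller is the av_image_copy_uc_from() machinery):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize,
                                        const uint8_t *src, ptrdiff_t src_linesize,
                                        ptrdiff_t bytewidth, int height);

    /* Illustrative wrapper: try the uncached (movntdqa) copy, else memcpy row by row. */
    static void copy_plane_maybe_uncached(uint8_t *dst, ptrdiff_t dst_linesize,
                                          const uint8_t *src, ptrdiff_t src_linesize,
                                          ptrdiff_t bytewidth, int height)
    {
        if (ff_image_copy_plane_uc_from_x86(dst, dst_linesize, src, src_linesize,
                                            bytewidth, height) >= 0)
            return;                               /* SSE4 path handled it */
        for (int i = 0; i < height; i++) {        /* generic fallback */
            memcpy(dst, src, bytewidth);
            dst += dst_linesize;
            src += src_linesize;
        }
    }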
diff --git a/media/ffvpx/libavutil/x86/moz.build b/media/ffvpx/libavutil/x86/moz.build
index 1d1a6ca67..b56ed75ea 100644
--- a/media/ffvpx/libavutil/x86/moz.build
+++ b/media/ffvpx/libavutil/x86/moz.build
@@ -12,6 +12,8 @@ SOURCES += [
'fixed_dsp_init.c',
'float_dsp.asm',
'float_dsp_init.c',
+ 'imgutils.asm',
+ 'imgutils_init.c',
'lls.asm',
'lls_init.c'
]
diff --git a/media/ffvpx/libavutil/x86/x86inc.asm b/media/ffvpx/libavutil/x86/x86inc.asm
index b2e9c6019..6a054a3e0 100644
--- a/media/ffvpx/libavutil/x86/x86inc.asm
+++ b/media/ffvpx/libavutil/x86/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2017 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
@@ -87,7 +87,9 @@
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,aout
- section .text
+ SECTION .text
+ %elifidn __OUTPUT_FORMAT__,coff
+ SECTION .text
%else
SECTION .rodata align=%1
%endif
@@ -385,7 +387,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%ifnum %1
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
+ ; Reserve an additional register for storing the original stack pointer, but avoid using
+ ; eax/rax for this purpose since it can potentially get overwritten as a return value.
%assign regs_used (regs_used + 1)
+ %if ARCH_X86_64 && regs_used == 7
+ %assign regs_used 8
+ %elif ARCH_X86_64 == 0 && regs_used == 1
+ %assign regs_used 2
+ %endif
%endif
%if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
@@ -419,10 +428,10 @@ DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
@@ -468,41 +477,42 @@ DECLARE_REG 14, R15, 120
WIN64_PUSH_XMM
%endmacro
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
%if xmm_regs_used > 8
%assign %%i xmm_regs_used
%rep xmm_regs_used-8
%assign %%i %%i-1
- movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+ movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
- add %1, stack_size_padded
+ add rsp, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7
- movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6
- movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
-%macro WIN64_RESTORE_XMM 1
- WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+ WIN64_RESTORE_XMM_INTERNAL
%assign stack_offset (stack_offset-stack_size_padded)
+ %assign stack_size_padded 0
%assign xmm_regs_used 0
%endmacro
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%macro RET 0
- WIN64_RESTORE_XMM_INTERNAL rsp
+ WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
vzeroupper
@@ -523,10 +533,10 @@ DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
@@ -618,7 +628,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
- %macro WIN64_RESTORE_XMM 1
+ %macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
@@ -629,7 +639,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
- %if has_epilogue
+ %if has_epilogue || cpuflag(ssse3)
RET
%else
rep ret
@@ -780,25 +790,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
-%assign cpuflags_avx (1<<11)| cpuflags_sse42
-%assign cpuflags_xop (1<<12)| cpuflags_avx
-%assign cpuflags_fma4 (1<<13)| cpuflags_avx
-%assign cpuflags_fma3 (1<<14)| cpuflags_avx
-%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32 (1<<16)
-%assign cpuflags_cache64 (1<<17)
-%assign cpuflags_slowctz (1<<18)
-%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<21)
-%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
-%assign cpuflags_aesni (1<<24)|cpuflags_sse42
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni (1<<12)| cpuflags_sse42
+%assign cpuflags_avx (1<<13)| cpuflags_sse42
+%assign cpuflags_xop (1<<14)| cpuflags_avx
+%assign cpuflags_fma4 (1<<15)| cpuflags_avx
+%assign cpuflags_fma3 (1<<16)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
+
+%assign cpuflags_cache32 (1<<20)
+%assign cpuflags_cache64 (1<<21)
+%assign cpuflags_slowctz (1<<22)
+%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<24)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -1030,7 +1040,11 @@ INIT_XMM
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
- call_internal %1 %+ SUFFIX, %1
+ %ifid %1
+ call_internal %1 %+ SUFFIX, %1
+ %else
+ call %1
+ %endif
%endmacro
%macro call_internal 2
%xdefine %%i %2
diff --git a/media/ffvpx/libavutil/x86/x86util.asm b/media/ffvpx/libavutil/x86/x86util.asm
index 44ed750ae..e1220dfc1 100644
--- a/media/ffvpx/libavutil/x86/x86util.asm
+++ b/media/ffvpx/libavutil/x86/x86util.asm
@@ -29,6 +29,21 @@
%include "libavutil/x86/x86inc.asm"
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base + stride], [base + 2*stride], [base3], \
+ [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4]
+
+; Interleave low src0 with low src1 and store in src0,
+; interleave high src0 with high src1 and store in src1.
+; %1 - types
+; %2 - index of the register with src0
+; %3 - index of the register with src1
+; %4 - index of the register for intermediate results
+; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3
+; src1: y0 y1 y2 y3 q0 q1 q2 q3
+; output: src0: x0 y0 x1 y1 x2 y2 x3 y3
+; src1: z0 q0 z1 q1 z2 q2 z3 q3
%macro SBUTTERFLY 4
%ifidn %1, dqqq
vperm2i128 m%4, m%2, m%3, q0301
@@ -56,6 +71,12 @@
SWAP %1, %3, %2
%endmacro
+%macro SBUTTERFLYPD 3
+ movlhps m%3, m%1, m%2
+ movhlps m%2, m%2, m%1
+ SWAP %1, %3
+%endmacro
+
%macro TRANSPOSE4x4B 5
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
@@ -102,12 +123,9 @@
%macro TRANSPOSE4x4PS 5
SBUTTERFLYPS %1, %2, %5
SBUTTERFLYPS %3, %4, %5
- movlhps m%5, m%1, m%3
- movhlps m%3, m%1
- SWAP %5, %1
- movlhps m%5, m%2, m%4
- movhlps m%4, m%2
- SWAP %5, %2, %3
+ SBUTTERFLYPD %1, %3, %5
+ SBUTTERFLYPD %2, %4, %5
+ SWAP %2, %3
%endmacro
%macro TRANSPOSE8x4D 9-11
@@ -260,6 +278,21 @@
SWAP %12, %15
%endmacro
+%macro TRANSPOSE_8X8B 8
+ %if mmsize == 8
+ %error "This macro does not support mmsize == 8"
+ %endif
+ punpcklbw m%1, m%2
+ punpcklbw m%3, m%4
+ punpcklbw m%5, m%6
+ punpcklbw m%7, m%8
+ TRANSPOSE4x4W %1, %3, %5, %7, %2
+ MOVHL m%2, m%1
+ MOVHL m%4, m%3
+ MOVHL m%6, m%5
+ MOVHL m%8, m%7
+%endmacro
+
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)
@@ -799,12 +832,25 @@
pmaxsd %1, %2
%endmacro
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
- vbroadcastss %1, %2
-%else ; sse
- movss %1, %2
- shufps %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vbroadcastss %1, %2
+%elif cpuflag(avx)
+ %ifnum sizeof%2 ; avx1 register
+ shufps xmm%1, xmm%2, xmm%2, q0000
+ %if sizeof%1 >= 32 ; mmsize>=32
+ vinsertf128 %1, %1, xmm%1, 1
+ %endif
+ %else ; avx1 memory
+ vbroadcastss %1, %2
+ %endif
+%else
+ %ifnum sizeof%2 ; sse register
+ shufps %1, %2, %2, q0000
+ %else ; sse memory
+ movss %1, %2
+ shufps %1, %1, 0
+ %endif
%endif
%endmacro
@@ -819,6 +865,21 @@
%endif
%endmacro
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vpbroadcastd %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+ %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+ %ifnum sizeof%2 ; sse2 register
+ pshufd %1, %2, q0000
+ %else ; sse memory
+ movd %1, %2
+ pshufd %1, %1, 0
+ %endif
+%endif
+%endmacro
+
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
@@ -871,3 +932,79 @@
psrlq %1, 8*(%2)
%endif
%endmacro
+
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+ punpckhqdq %1, %2
+%elif cpuflag(avx)
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+ %if sizeof%1>=32 ; avx
+ vperm2f128 %2, %1, %1, (0)*16+(1)
+ addps %1, %2
+ %endif
+ shufps %2, %1, %1, q1032
+ addps %1, %2
+ shufps %2, %1, %1, q0321
+ addps %1, %2
+%else ; this form is a bit faster than the short avx-like emulation.
+ movaps %2, %1
+ shufps %1, %1, q1032
+ addps %1, %2
+ movaps %2, %1
+ shufps %1, %1, q0321
+ addps %1, %2
+ ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ blendvps %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %ifnidn %3,xmm0
+ %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+ %endif
+ blendvps %1, %2, %3
+%else
+ xorps %2, %1
+ andps %2, %3
+ xorps %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+ %error pblendb not possible with ymm on avx1, try blendvps.
+ %endif
+ pblendvb %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %ifnidn %3,xmm0
+ %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3
+ %endif
+ pblendvb %1, %2, %3
+%else
+ pxor %2, %1
+ pand %2, %3
+ pxor %1, %2
+%endif
+%endmacro
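[editor's note] The pre-SSE4 paths of BLENDVPS/PBLENDVB rely on the classic bit-select identity result = a ^ ((a ^ b) & mask), which takes bits of b where the mask is set and bits of a where it is clear (and, as the comments warn, clobbers src_b along the way). A small C check of that identity, offered as an illustration of the trick rather than of the macros themselves:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit-select: bits of b where mask is 1, bits of a where mask is 0.
     * This is what the xorps/andps/xorps (or pxor/pand/pxor) fallback does,
     * applied lane-wise to whole SIMD registers. */
    static uint32_t bit_select(uint32_t a, uint32_t b, uint32_t mask)
    {
        return a ^ ((a ^ b) & mask);
    }

    int main(void)
    {
        uint32_t a = 0x11223344, b = 0xAABBCCDD, mask = 0xFFFF0000;
        printf("%08X\n", bit_select(a, b, mask)); /* prints AABB3344 */
        return 0;
    }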