author      Moonchild <mcwerewolf@gmail.com>       2018-04-27 19:15:26 +0200
committer   GitHub <noreply@github.com>            2018-04-27 19:15:26 +0200
commit      c75dae3ed21bfa5a8ae46cd83d18329af5bea05a (patch)
tree        ef657c38feb2368a3c86765778d1f068aa5eb614 /media/ffvpx/libavutil/x86
parent      c82c6d960a7f19d6595171f9705c43514f20c1ec (diff)
parent      6ada4b14e4cfc91f5f1b2556623cab691f3ab813 (diff)
Merge pull request #275 from trav90/ffvpx-resync
Resync ffvpx code with 3.4.2-release from upstream
Diffstat (limited to 'media/ffvpx/libavutil/x86')
 -rw-r--r--  media/ffvpx/libavutil/x86/cpu.c             |  37
 -rw-r--r--  media/ffvpx/libavutil/x86/cpu.h             |   6
 -rw-r--r--  media/ffvpx/libavutil/x86/emms.asm          |   4
 -rw-r--r--  media/ffvpx/libavutil/x86/emms.h            |   4
 -rw-r--r--  media/ffvpx/libavutil/x86/float_dsp.asm     |  86
 -rw-r--r--  media/ffvpx/libavutil/x86/float_dsp_init.c  |  17
 -rw-r--r--  media/ffvpx/libavutil/x86/imgutils.asm      |  53
 -rw-r--r--  media/ffvpx/libavutil/x86/imgutils_init.c   |  49
 -rw-r--r--  media/ffvpx/libavutil/x86/moz.build         |   2
 -rw-r--r--  media/ffvpx/libavutil/x86/x86inc.asm        |  94
 -rw-r--r--  media/ffvpx/libavutil/x86/x86util.asm       | 161
 11 files changed, 449 insertions, 64 deletions
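Of the changes pulled in by this resync, the one most likely to matter to callers is the new ff_get_cpu_max_align_x86() added to cpu.c below, which maps the detected CPU feature flags to the buffer alignment the SIMD code needs: 32 bytes when any AVX-class flag is set, 16 bytes for the SSE-class flags, and 8 bytes otherwise. The sketch that follows is not part of the patch — it is a hypothetical, standalone mirror of that mapping (the name max_align_for_flags is invented), built only on the public av_get_cpu_flags() API and the AV_CPU_FLAG_* constants.

```c
/* Sketch only: a standalone mirror of the flag->alignment mapping that the
 * new ff_get_cpu_max_align_x86() in cpu.c implements. max_align_for_flags()
 * is a hypothetical name, not part of the patch. */
#include <stddef.h>
#include <libavutil/cpu.h>

static size_t max_align_for_flags(int flags)
{
    if (flags & (AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVX  | AV_CPU_FLAG_XOP |
                 AV_CPU_FLAG_FMA4 | AV_CPU_FLAG_FMA3 | AV_CPU_FLAG_AVXSLOW))
        return 32;   /* 256-bit ymm loads/stores want 32-byte alignment */
    if (flags & (AV_CPU_FLAG_AESNI | AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_SSE4 |
                 AV_CPU_FLAG_SSSE3 | AV_CPU_FLAG_SSE3  | AV_CPU_FLAG_SSE2 |
                 AV_CPU_FLAG_SSE   | AV_CPU_FLAG_ATOM  |
                 AV_CPU_FLAG_SSSE3SLOW | AV_CPU_FLAG_SSE3SLOW |
                 AV_CPU_FLAG_SSE2SLOW))
        return 16;   /* 128-bit xmm loads/stores want 16-byte alignment */
    return 8;        /* MMX or scalar only */
}

/* Usage: size_t align = max_align_for_flags(av_get_cpu_flags()); */
```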
diff --git a/media/ffvpx/libavutil/x86/cpu.c b/media/ffvpx/libavutil/x86/cpu.c index f3a49c677..f33088c8c 100644 --- a/media/ffvpx/libavutil/x86/cpu.c +++ b/media/ffvpx/libavutil/x86/cpu.c @@ -28,7 +28,7 @@ #include "libavutil/cpu.h" #include "libavutil/cpu_internal.h" -#if HAVE_YASM +#if HAVE_X86ASM #define cpuid(index, eax, ebx, ecx, edx) \ ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx) @@ -66,7 +66,7 @@ #define cpuid_test() 1 -#elif HAVE_YASM +#elif HAVE_X86ASM #define cpuid_test ff_cpu_cpuid_test @@ -221,9 +221,42 @@ int ff_get_cpu_flags_x86(void) * functions on the Atom. */ if (family == 6 && model == 28) rval |= AV_CPU_FLAG_ATOM; + + /* Conroe has a slow shuffle unit. Check the model number to ensure not + * to include crippled low-end Penryns and Nehalems that lack SSE4. */ + if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) && + family == 6 && model < 23) + rval |= AV_CPU_FLAG_SSSE3SLOW; } #endif /* cpuid */ return rval; } + +size_t ff_get_cpu_max_align_x86(void) +{ + int flags = av_get_cpu_flags(); + + if (flags & (AV_CPU_FLAG_AVX2 | + AV_CPU_FLAG_AVX | + AV_CPU_FLAG_XOP | + AV_CPU_FLAG_FMA4 | + AV_CPU_FLAG_FMA3 | + AV_CPU_FLAG_AVXSLOW)) + return 32; + if (flags & (AV_CPU_FLAG_AESNI | + AV_CPU_FLAG_SSE42 | + AV_CPU_FLAG_SSE4 | + AV_CPU_FLAG_SSSE3 | + AV_CPU_FLAG_SSE3 | + AV_CPU_FLAG_SSE2 | + AV_CPU_FLAG_SSE | + AV_CPU_FLAG_ATOM | + AV_CPU_FLAG_SSSE3SLOW | + AV_CPU_FLAG_SSE3SLOW | + AV_CPU_FLAG_SSE2SLOW)) + return 16; + + return 8; +} diff --git a/media/ffvpx/libavutil/x86/cpu.h b/media/ffvpx/libavutil/x86/cpu.h index f171037f1..309b8e746 100644 --- a/media/ffvpx/libavutil/x86/cpu.h +++ b/media/ffvpx/libavutil/x86/cpu.h @@ -38,6 +38,8 @@ #define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3) #define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3) #define X86_SSSE3(flags) CPUEXT(flags, SSSE3) +#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3) +#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3) #define X86_SSE4(flags) CPUEXT(flags, SSE4) #define X86_SSE42(flags) CPUEXT(flags, SSE42) #define X86_AVX(flags) CPUEXT(flags, AVX) @@ -61,6 +63,8 @@ #define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3) #define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3) #define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3) #define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4) #define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) #define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) @@ -88,6 +92,8 @@ #define INLINE_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3) #define INLINE_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3) #define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3) +#define INLINE_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3) +#define INLINE_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3) #define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4) #define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) #define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) diff --git a/media/ffvpx/libavutil/x86/emms.asm b/media/ffvpx/libavutil/x86/emms.asm index 0aad34af3..8611762d7 100644 --- a/media/ffvpx/libavutil/x86/emms.asm +++ b/media/ffvpx/libavutil/x86/emms.asm @@ -23,8 +23,8 @@ SECTION .text 
;----------------------------------------------------------------------------- -; void avpriv_emms_yasm(void) +; void avpriv_emms_asm(void) ;----------------------------------------------------------------------------- -cvisible emms_yasm, 0, 0 +cvisible emms_asm, 0, 0 emms RET diff --git a/media/ffvpx/libavutil/x86/emms.h b/media/ffvpx/libavutil/x86/emms.h index 42c18e295..c21e34b45 100644 --- a/media/ffvpx/libavutil/x86/emms.h +++ b/media/ffvpx/libavutil/x86/emms.h @@ -23,7 +23,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -void avpriv_emms_yasm(void); +void avpriv_emms_asm(void); #if HAVE_MMX_INLINE # define emms_c emms_c @@ -49,7 +49,7 @@ static av_always_inline void emms_c(void) # include <mmintrin.h> # define emms_c _mm_empty #elif HAVE_MMX_EXTERNAL -# define emms_c avpriv_emms_yasm +# define emms_c avpriv_emms_asm #endif /* HAVE_MMX_INLINE */ #endif /* AVUTIL_X86_EMMS_H */ diff --git a/media/ffvpx/libavutil/x86/float_dsp.asm b/media/ffvpx/libavutil/x86/float_dsp.asm index 021ff03c8..06d2d2cfd 100644 --- a/media/ffvpx/libavutil/x86/float_dsp.asm +++ b/media/ffvpx/libavutil/x86/float_dsp.asm @@ -22,6 +22,9 @@ %include "x86util.asm" +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + SECTION .text ;----------------------------------------------------------------------------- @@ -149,6 +152,69 @@ INIT_XMM sse VECTOR_FMUL_SCALAR ;------------------------------------------------------------------------------ +; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMAC_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr + mov lenq, lenaddrm + VBROADCASTSD m0, mulm +%else +%if UNIX64 +cglobal vector_dmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-mmsize*4] +.loop: +%if cpuflag(fma3) + movaps m1, [dstq+lenq] + movaps m2, [dstq+lenq+1*mmsize] + movaps m3, [dstq+lenq+2*mmsize] + movaps m4, [dstq+lenq+3*mmsize] + fmaddpd m1, m0, [srcq+lenq], m1 + fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 + fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 + fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 +%else ; cpuflag + mulpd m1, m0, [srcq+lenq] + mulpd m2, m0, [srcq+lenq+1*mmsize] + mulpd m3, m0, [srcq+lenq+2*mmsize] + mulpd m4, m0, [srcq+lenq+3*mmsize] + addpd m1, m1, [dstq+lenq] + addpd m2, m2, [dstq+lenq+1*mmsize] + addpd m3, m3, [dstq+lenq+2*mmsize] + addpd m4, m4, [dstq+lenq+3*mmsize] +%endif ; cpuflag + movaps [dstq+lenq], m1 + movaps [dstq+lenq+1*mmsize], m2 + movaps [dstq+lenq+2*mmsize], m3 + movaps [dstq+lenq+3*mmsize], m4 + sub lenq, mmsize*4 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_DMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, ; int len) ;------------------------------------------------------------------------------ @@ -177,8 +243,8 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len .loop: mulpd m1, m0, [srcq+lenq ] mulpd m2, m0, [srcq+lenq+mmsize] - mova [dstq+lenq ], m1 - mova [dstq+lenq+mmsize], m2 + movaps [dstq+lenq ], m1 + movaps [dstq+lenq+mmsize], m2 sub lenq, 2*mmsize jge 
.loop REP_RET @@ -296,10 +362,16 @@ VECTOR_FMUL_ADD ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_REVERSE 0 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + movaps m2, [pd_reverse] +%endif lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: -%if cpuflag(avx) +%if cpuflag(avx2) + vpermps m0, m2, [src1q] + vpermps m1, m2, [src1q+mmsize] +%elif cpuflag(avx) vmovaps xmm0, [src1q + 16] vinsertf128 m0, m0, [src1q], 1 vshufps m0, m0, m0, q0123 @@ -314,8 +386,8 @@ ALIGN 16 %endif mulps m0, m0, [src0q + lenq + mmsize] mulps m1, m1, [src0q + lenq] - mova [dstq + lenq + mmsize], m0 - mova [dstq + lenq], m1 + movaps [dstq + lenq + mmsize], m0 + movaps [dstq + lenq], m1 add src1q, 2*mmsize sub lenq, 2*mmsize jge .loop @@ -328,6 +400,10 @@ VECTOR_FMUL_REVERSE INIT_YMM avx VECTOR_FMUL_REVERSE %endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) INIT_XMM sse diff --git a/media/ffvpx/libavutil/x86/float_dsp_init.c b/media/ffvpx/libavutil/x86/float_dsp_init.c index c836a78e1..122087a19 100644 --- a/media/ffvpx/libavutil/x86/float_dsp_init.c +++ b/media/ffvpx/libavutil/x86/float_dsp_init.c @@ -39,6 +39,13 @@ void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul, void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, int len); +void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul, + int len); + void ff_vector_dmul_scalar_sse2(double *dst, const double *src, double mul, int len); void ff_vector_dmul_scalar_avx(double *dst, const double *src, @@ -60,10 +67,12 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_avx(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, + const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); -void ff_butterflies_float_sse(float *src0, float *src1, int len); +void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len); av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) { @@ -83,17 +92,23 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->butterflies_float = ff_butterflies_float_sse; } if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; } if (EXTERNAL_AVX_FAST(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_avx; fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2; + } if (EXTERNAL_FMA3_FAST(cpu_flags)) { fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; } } diff --git a/media/ffvpx/libavutil/x86/imgutils.asm b/media/ffvpx/libavutil/x86/imgutils.asm new file mode 100644 index 000000000..3cca56cdc --- /dev/null +++ 
b/media/ffvpx/libavutil/x86/imgutils.asm @@ -0,0 +1,53 @@ +;***************************************************************************** +;* Copyright 2016 Anton Khirnov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +INIT_XMM sse4 +cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos + add dstq, bwq + add srcq, bwq + neg bwq + +.row_start: + mov rowposq, bwq + +.loop: + movntdqa m0, [srcq + rowposq + 0 * mmsize] + movntdqa m1, [srcq + rowposq + 1 * mmsize] + movntdqa m2, [srcq + rowposq + 2 * mmsize] + movntdqa m3, [srcq + rowposq + 3 * mmsize] + + mova [dstq + rowposq + 0 * mmsize], m0 + mova [dstq + rowposq + 1 * mmsize], m1 + mova [dstq + rowposq + 2 * mmsize], m2 + mova [dstq + rowposq + 3 * mmsize], m3 + + add rowposq, 4 * mmsize + jnz .loop + + add srcq, src_linesizeq + add dstq, dst_linesizeq + dec heightd + jnz .row_start + + RET diff --git a/media/ffvpx/libavutil/x86/imgutils_init.c b/media/ffvpx/libavutil/x86/imgutils_init.c new file mode 100644 index 000000000..4ea398205 --- /dev/null +++ b/media/ffvpx/libavutil/x86/imgutils_init.c @@ -0,0 +1,49 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/cpu.h" +#include "libavutil/error.h" +#include "libavutil/imgutils.h" +#include "libavutil/imgutils_internal.h" +#include "libavutil/internal.h" + +#include "cpu.h" + +void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize, + const uint8_t *src, ptrdiff_t src_linesize, + ptrdiff_t bytewidth, int height); + +int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize, + const uint8_t *src, ptrdiff_t src_linesize, + ptrdiff_t bytewidth, int height) +{ + int cpu_flags = av_get_cpu_flags(); + ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64); + + if (EXTERNAL_SSE4(cpu_flags) && + bw_aligned <= dst_linesize && bw_aligned <= src_linesize) + ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize, + bw_aligned, height); + else + return AVERROR(ENOSYS); + + return 0; +} diff --git a/media/ffvpx/libavutil/x86/moz.build b/media/ffvpx/libavutil/x86/moz.build index 1d1a6ca67..b56ed75ea 100644 --- a/media/ffvpx/libavutil/x86/moz.build +++ b/media/ffvpx/libavutil/x86/moz.build @@ -12,6 +12,8 @@ SOURCES += [ 'fixed_dsp_init.c', 'float_dsp.asm', 'float_dsp_init.c', + 'imgutils.asm', + 'imgutils_init.c', 'lls.asm', 'lls_init.c' ] diff --git a/media/ffvpx/libavutil/x86/x86inc.asm b/media/ffvpx/libavutil/x86/x86inc.asm index b2e9c6019..6a054a3e0 100644 --- a/media/ffvpx/libavutil/x86/x86inc.asm +++ b/media/ffvpx/libavutil/x86/x86inc.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2016 x264 project +;* Copyright (C) 2005-2017 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> @@ -87,7 +87,9 @@ ; keep supporting OS/2. %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,aout - section .text + SECTION .text + %elifidn __OUTPUT_FORMAT__,coff + SECTION .text %else SECTION .rodata align=%1 %endif @@ -385,7 +387,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) + %if ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif %endif %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) @@ -419,10 +428,10 @@ DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 @@ -468,41 +477,42 @@ DECLARE_REG 14, R15, 120 WIN64_PUSH_XMM %endmacro -%macro WIN64_RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %if xmm_regs_used > 8 %assign %%i xmm_regs_used %rep xmm_regs_used-8 %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else - add %1, stack_size_padded + add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 %macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if mmsize == 32 vzeroupper @@ -523,10 +533,10 @@ DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 @@ -618,7 +628,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro - %macro WIN64_RESTORE_XMM 1 + %macro WIN64_RESTORE_XMM 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro @@ -629,7 +639,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) 
%macro REP_RET 0 - %if has_epilogue + %if has_epilogue || cpuflag(ssse3) RET %else rep ret @@ -780,25 +790,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 -%assign cpuflags_aesni (1<<24)|cpuflags_sse42 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx (1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 + +%assign cpuflags_cache32 (1<<20) +%assign cpuflags_cache64 (1<<21) +%assign cpuflags_slowctz (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -1030,7 +1040,11 @@ INIT_XMM ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 - call_internal %1 %+ SUFFIX, %1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif %endmacro %macro call_internal 2 %xdefine %%i %2 diff --git a/media/ffvpx/libavutil/x86/x86util.asm b/media/ffvpx/libavutil/x86/x86util.asm index 44ed750ae..e1220dfc1 100644 --- a/media/ffvpx/libavutil/x86/x86util.asm +++ b/media/ffvpx/libavutil/x86/x86util.asm @@ -29,6 +29,21 @@ %include "libavutil/x86/x86inc.asm" +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base3], \ + [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4] + +; Interleave low src0 with low src1 and store in src0, +; interleave high src0 with high src1 and store in src1. 
+; %1 - types +; %2 - index of the register with src0 +; %3 - index of the register with src1 +; %4 - index of the register for intermediate results +; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3 +; src1: y0 y1 y2 y3 q0 q1 q2 q3 +; output: src0: x0 y0 x1 y1 x2 y2 x3 y3 +; src1: z0 q0 z1 q1 z2 q2 z3 q3 %macro SBUTTERFLY 4 %ifidn %1, dqqq vperm2i128 m%4, m%2, m%3, q0301 @@ -56,6 +71,12 @@ SWAP %1, %3, %2 %endmacro +%macro SBUTTERFLYPD 3 + movlhps m%3, m%1, m%2 + movhlps m%2, m%2, m%1 + SWAP %1, %3 +%endmacro + %macro TRANSPOSE4x4B 5 SBUTTERFLY bw, %1, %2, %5 SBUTTERFLY bw, %3, %4, %5 @@ -102,12 +123,9 @@ %macro TRANSPOSE4x4PS 5 SBUTTERFLYPS %1, %2, %5 SBUTTERFLYPS %3, %4, %5 - movlhps m%5, m%1, m%3 - movhlps m%3, m%1 - SWAP %5, %1 - movlhps m%5, m%2, m%4 - movhlps m%4, m%2 - SWAP %5, %2, %3 + SBUTTERFLYPD %1, %3, %5 + SBUTTERFLYPD %2, %4, %5 + SWAP %2, %3 %endmacro %macro TRANSPOSE8x4D 9-11 @@ -260,6 +278,21 @@ SWAP %12, %15 %endmacro +%macro TRANSPOSE_8X8B 8 + %if mmsize == 8 + %error "This macro does not support mmsize == 8" + %endif + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + punpcklbw m%5, m%6 + punpcklbw m%7, m%8 + TRANSPOSE4x4W %1, %3, %5, %7, %2 + MOVHL m%2, m%1 + MOVHL m%4, m%3 + MOVHL m%6, m%5 + MOVHL m%8, m%7 +%endmacro + ; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place %macro PABSW 2 %if cpuflag(ssse3) @@ -799,12 +832,25 @@ pmaxsd %1, %2 %endmacro -%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -%if cpuflag(avx) - vbroadcastss %1, %2 -%else ; sse - movss %1, %2 - shufps %1, %1, 0 +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif %endif %endmacro @@ -819,6 +865,21 @@ %endif %endmacro +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 @@ -871,3 +932,79 @@ psrlq %1, 8*(%2) %endif %endmacro + +%macro MOVHL 2 ; dst, src +%ifidn %1, %2 + punpckhqdq %1, %2 +%elif cpuflag(avx) + punpckhqdq %1, %2, %2 +%elif cpuflag(sse4) + pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones +%else + movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst +%endif +%endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. 
+ movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro |
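The other sizeable addition in this resync is the SSE4 non-temporal plane copy (imgutils.asm plus its imgutils_init.c dispatcher). The dispatcher only takes the fast path when the byte width, aligned up to 64, fits inside both linesizes; otherwise it returns AVERROR(ENOSYS) and the caller is expected to copy normally. Below is a minimal sketch of that calling pattern — copy_plane_uc() is a hypothetical wrapper, not something this patch adds; the prototype is copied from the new imgutils_init.c.

```c
/* Sketch only: copy_plane_uc() is a hypothetical wrapper, not part of the
 * patch. It tries the new non-temporal SSE4 copy and falls back to a plain
 * cached row-by-row memcpy when the helper reports AVERROR(ENOSYS). */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Prototype as introduced by imgutils_init.c in this patch. */
int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize,
                                    const uint8_t *src, ptrdiff_t src_linesize,
                                    ptrdiff_t bytewidth, int height);

static void copy_plane_uc(uint8_t *dst, ptrdiff_t dst_linesize,
                          const uint8_t *src, ptrdiff_t src_linesize,
                          ptrdiff_t bytewidth, int height)
{
    /* Fast path: movntdqa streaming loads, dispatched only when the
     * alignment/padding requirements described above are met. */
    if (ff_image_copy_plane_uc_from_x86(dst, dst_linesize, src, src_linesize,
                                        bytewidth, height) >= 0)
        return;

    /* Generic fallback: ordinary cached copy, one row at a time. */
    for (int i = 0; i < height; i++) {
        memcpy(dst, src, bytewidth);
        dst += dst_linesize;
        src += src_linesize;
    }
}
```

The streaming movntdqa loads mainly pay off when the source plane sits in uncacheable or write-combined memory (for example a mapped hardware frame), where ordinary cached reads would be much slower.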