11 files changed, 449 insertions, 64 deletions
diff --git a/media/ffvpx/libavutil/x86/cpu.c b/media/ffvpx/libavutil/x86/cpu.c
index f3a49c677..f33088c8c 100644
--- a/media/ffvpx/libavutil/x86/cpu.c
+++ b/media/ffvpx/libavutil/x86/cpu.c
@@ -28,7 +28,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/cpu_internal.h"
 
-#if HAVE_YASM
+#if HAVE_X86ASM
 
 #define cpuid(index, eax, ebx, ecx, edx)        \
     ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)
@@ -66,7 +66,7 @@
 
 #define cpuid_test() 1
 
-#elif HAVE_YASM
+#elif HAVE_X86ASM
 
 #define cpuid_test ff_cpu_cpuid_test
 
@@ -221,9 +221,42 @@ int ff_get_cpu_flags_x86(void)
          * functions on the Atom. */
         if (family == 6 && model == 28)
             rval |= AV_CPU_FLAG_ATOM;
+
+        /* Conroe has a slow shuffle unit. Check the model number to ensure not
+         * to include crippled low-end Penryns and Nehalems that lack SSE4. */
+        if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) &&
+            family == 6 && model < 23)
+            rval |= AV_CPU_FLAG_SSSE3SLOW;
     }
 
 #endif /* cpuid */
 
     return rval;
 }
+
+size_t ff_get_cpu_max_align_x86(void)
+{
+    /* Flags for which this function reports a 32-byte maximum alignment. */
+    const int align32_flags = AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVX  |
+                              AV_CPU_FLAG_XOP  | AV_CPU_FLAG_FMA4 |
+                              AV_CPU_FLAG_FMA3 | AV_CPU_FLAG_AVXSLOW;
+
+    /* Flags for which it reports 16 bytes when none of the above is set. */
+    const int align16_flags = AV_CPU_FLAG_AESNI     | AV_CPU_FLAG_SSE42    |
+                              AV_CPU_FLAG_SSE4      | AV_CPU_FLAG_SSSE3    |
+                              AV_CPU_FLAG_SSE3      | AV_CPU_FLAG_SSE2     |
+                              AV_CPU_FLAG_SSE       | AV_CPU_FLAG_ATOM     |
+                              AV_CPU_FLAG_SSSE3SLOW | AV_CPU_FLAG_SSE3SLOW |
+                              AV_CPU_FLAG_SSE2SLOW;
+
+    const int detected = av_get_cpu_flags();
+
+    /* Test the wider group first so that it takes precedence. */
+    if (detected & align32_flags)
+        return 32;
+    if (detected & align16_flags)
+        return 16;
+
+    /* Neither group matched: report 8 bytes. */
+    return 8;
+}
diff --git a/media/ffvpx/libavutil/x86/cpu.h b/media/ffvpx/libavutil/x86/cpu.h
index f171037f1..309b8e746 100644
--- a/media/ffvpx/libavutil/x86/cpu.h
+++ b/media/ffvpx/libavutil/x86/cpu.h
@@ -38,6 +38,8 @@
 #define X86_SSE3_FAST(flags)        CPUEXT_FAST(flags, SSE3)
 #define X86_SSE3_SLOW(flags)        CPUEXT_SLOW(flags, SSE3)
 #define X86_SSSE3(flags)            CPUEXT(flags, SSSE3)
+#define X86_SSSE3_FAST(flags)       CPUEXT_FAST(flags, SSSE3)
+#define X86_SSSE3_SLOW(flags)       CPUEXT_SLOW(flags, SSSE3)
 #define X86_SSE4(flags)             CPUEXT(flags, SSE4)
 #define X86_SSE42(flags)            CPUEXT(flags, SSE42)
 #define X86_AVX(flags)              CPUEXT(flags, AVX)
@@ -61,6 +63,8 @@
 #define EXTERNAL_SSE3_FAST(flags)   CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3)
 #define EXTERNAL_SSE3_SLOW(flags)   CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3)
 #define EXTERNAL_SSSE3(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3)
+#define EXTERNAL_SSSE3_FAST(flags)  CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3)
+#define EXTERNAL_SSSE3_SLOW(flags)  CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3)
 #define EXTERNAL_SSE4(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4)
 #define EXTERNAL_SSE42(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42)
 #define EXTERNAL_AVX(flags)         CPUEXT_SUFFIX(flags, _EXTERNAL, AVX)
@@ -88,6 +92,8 @@
 #define INLINE_SSE3_FAST(flags)     CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3)
 #define INLINE_SSE3_SLOW(flags)     CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3)
 #define INLINE_SSSE3(flags)         CPUEXT_SUFFIX(flags, _INLINE, SSSE3)
+#define INLINE_SSSE3_FAST(flags)    CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3)
+#define INLINE_SSSE3_SLOW(flags)    CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3)
 #define INLINE_SSE4(flags)          CPUEXT_SUFFIX(flags, _INLINE, SSE4)
 #define INLINE_SSE42(flags)         CPUEXT_SUFFIX(flags, _INLINE, SSE42)
 #define INLINE_AVX(flags)           CPUEXT_SUFFIX(flags, _INLINE, AVX)
diff --git a/media/ffvpx/libavutil/x86/emms.asm b/media/ffvpx/libavutil/x86/emms.asm
index 0aad34af3..8611762d7 100644
--- a/media/ffvpx/libavutil/x86/emms.asm
+++ b/media/ffvpx/libavutil/x86/emms.asm
@@ -23,8 +23,8 @@
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void avpriv_emms_yasm(void)
+; void avpriv_emms_asm(void)
 ;-----------------------------------------------------------------------------
-cvisible emms_yasm, 0, 0
+cvisible emms_asm, 0, 0
     emms
     RET
diff --git a/media/ffvpx/libavutil/x86/emms.h b/media/ffvpx/libavutil/x86/emms.h
index 42c18e295..c21e34b45 100644
--- a/media/ffvpx/libavutil/x86/emms.h
+++ b/media/ffvpx/libavutil/x86/emms.h
@@ -23,7 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 
-void avpriv_emms_yasm(void);
+void avpriv_emms_asm(void);
 
 #if HAVE_MMX_INLINE
 #   define emms_c emms_c
@@ -49,7 +49,7 @@ static av_always_inline void emms_c(void)
 #   include <mmintrin.h>
 #   define emms_c _mm_empty
 #elif HAVE_MMX_EXTERNAL
-#   define emms_c avpriv_emms_yasm
+#   define emms_c avpriv_emms_asm
 #endif /* HAVE_MMX_INLINE */
 
 #endif /* AVUTIL_X86_EMMS_H */
diff --git a/media/ffvpx/libavutil/x86/float_dsp.asm b/media/ffvpx/libavutil/x86/float_dsp.asm
index 021ff03c8..06d2d2cfd 100644
--- a/media/ffvpx/libavutil/x86/float_dsp.asm
+++ b/media/ffvpx/libavutil/x86/float_dsp.asm
@@ -22,6 +22,9 @@
 
 %include "x86util.asm"
 
+SECTION_RODATA 32
+pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -149,6 +152,69 @@ INIT_XMM sse
 VECTOR_FMUL_SCALAR
 
 ;------------------------------------------------------------------------------
+; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
+;                            int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_DMAC_SCALAR 0 ; dst[i] += src[i] * mul, on doubles
+%if ARCH_X86_32
+cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
+    mov          lenq, lenaddrm ; len comes from the lenaddr slot (presumably mul spans two slots; confirm)
+    VBROADCASTSD m0, mulm       ; m0 = mul in every lane
+%else
+%if UNIX64
+cglobal vector_dmac_scalar, 3,3,5, dst, src, len
+%else
+cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
+    SWAP 0, 2                   ; rename so that m0 is the register previously called m2 (presumably holding mul; confirm)
+%endif
+    movlhps     xm0, xm0        ; copy the low double of xm0 into its high half
+%if cpuflag(avx)
+    vinsertf128  m0, m0, xm0, 1 ; mirror the low 128 bits into the high 128 bits
+%endif
+%endif
+    lea    lenq, [lend*8-mmsize*4] ; byte offset of the last group of four vectors
+.loop:
+%if cpuflag(fma3)
+    movaps   m1,     [dstq+lenq]
+    movaps   m2,     [dstq+lenq+1*mmsize]
+    movaps   m3,     [dstq+lenq+2*mmsize]
+    movaps   m4,     [dstq+lenq+3*mmsize]
+    fmaddpd  m1, m0, [srcq+lenq], m1 ; fused form: mul * src + dst
+    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
+    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
+    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
+%else ; cpuflag
+    mulpd    m1, m0, [srcq+lenq]     ; separate multiply, then add dst below
+    mulpd    m2, m0, [srcq+lenq+1*mmsize]
+    mulpd    m3, m0, [srcq+lenq+2*mmsize]
+    mulpd    m4, m0, [srcq+lenq+3*mmsize]
+    addpd    m1, m1, [dstq+lenq]
+    addpd    m2, m2, [dstq+lenq+1*mmsize]
+    addpd    m3, m3, [dstq+lenq+2*mmsize]
+    addpd    m4, m4, [dstq+lenq+3*mmsize]
+%endif ; cpuflag
+    movaps [dstq+lenq], m1           ; write the four result vectors back to dst
+    movaps [dstq+lenq+1*mmsize], m2
+    movaps [dstq+lenq+2*mmsize], m3
+    movaps [dstq+lenq+3*mmsize], m4
+    sub    lenq, mmsize*4            ; step down to the previous group of four vectors
+    jge .loop                        ; continue while the offset is still non-negative
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_DMAC_SCALAR
+%endif
+
+;------------------------------------------------------------------------------
 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
 ;                            int len)
 ;------------------------------------------------------------------------------
@@ -177,8 +243,8 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
 .loop:
     mulpd          m1, m0, [srcq+lenq       ]
     mulpd          m2, m0, [srcq+lenq+mmsize]
-    mova   [dstq+lenq       ], m1
-    mova   [dstq+lenq+mmsize], m2
+    movaps [dstq+lenq       ], m1
+    movaps [dstq+lenq+mmsize], m2
     sub          lenq, 2*mmsize
     jge .loop
     REP_RET
@@ -296,10 +362,16 @@ VECTOR_FMUL_ADD
 ;-----------------------------------------------------------------------------
 %macro VECTOR_FMUL_REVERSE 0
 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
+%if cpuflag(avx2)
+    movaps  m2, [pd_reverse]
+%endif
     lea       lenq, [lend*4 - 2*mmsize]
 ALIGN 16
 .loop:
-%if cpuflag(avx)
+%if cpuflag(avx2)
+    vpermps m0, m2, [src1q]
+    vpermps m1, m2, [src1q+mmsize]
+%elif cpuflag(avx)
     vmovaps     xmm0, [src1q + 16]
     vinsertf128 m0, m0, [src1q], 1
     vshufps     m0, m0, m0, q0123
@@ -314,8 +386,8 @@ ALIGN 16
 %endif
     mulps   m0, m0, [src0q + lenq + mmsize]
     mulps   m1, m1, [src0q + lenq]
-    mova    [dstq + lenq + mmsize], m0
-    mova    [dstq + lenq], m1
+    movaps  [dstq + lenq + mmsize], m0
+    movaps  [dstq + lenq], m1
     add     src1q, 2*mmsize
     sub     lenq,  2*mmsize
     jge     .loop
@@ -328,6 +400,10 @@ VECTOR_FMUL_REVERSE
 INIT_YMM avx
 VECTOR_FMUL_REVERSE
 %endif
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VECTOR_FMUL_REVERSE
+%endif
 
 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
 INIT_XMM sse
diff --git a/media/ffvpx/libavutil/x86/float_dsp_init.c b/media/ffvpx/libavutil/x86/float_dsp_init.c
index c836a78e1..122087a19 100644
--- a/media/ffvpx/libavutil/x86/float_dsp_init.c
+++ b/media/ffvpx/libavutil/x86/float_dsp_init.c
@@ -39,6 +39,13 @@ void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
 void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                                int len);
 
+void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul,
+                                int len);
+void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul,
+                               int len);
+void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul,
+                                int len);
+
 void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                 double mul, int len);
 void ff_vector_dmul_scalar_avx(double *dst, const double *src,
@@ -60,10 +67,12 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                 const float *src1, int len);
 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                 const float *src1, int len);
+void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
+                                 const float *src1, int len);
 
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 
-void ff_butterflies_float_sse(float *src0, float *src1, int len);
+void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
 
 av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 {
@@ -83,17 +92,23 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->butterflies_float   = ff_butterflies_float_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         fdsp->vector_fmul = ff_vector_fmul_avx;
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
+        fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
         fdsp->vector_fmul_add    = ff_vector_fmul_add_avx;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
     }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
+    }
     if (EXTERNAL_FMA3_FAST(cpu_flags)) {
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
         fdsp->vector_fmul_add    = ff_vector_fmul_add_fma3;
+        fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
     }
 }
diff --git a/media/ffvpx/libavutil/x86/imgutils.asm b/media/ffvpx/libavutil/x86/imgutils.asm
new file mode 100644
index 000000000..3cca56cdc
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/imgutils.asm
@@ -0,0 +1,53 @@
+;*****************************************************************************
+;* Copyright 2016 Anton Khirnov
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos
+    add dstq, bwq                       ; point dst just past the row data
+    add srcq, bwq                       ; point src just past the row data
+    neg bwq                             ; bw = negative row width, used as the start index
+
+.row_start:
+    mov rowposq, bwq                    ; rowpos counts up from -width towards 0
+
+.loop:
+    movntdqa m0, [srcq + rowposq + 0 * mmsize] ; four 16-byte loads = 64 source bytes
+    movntdqa m1, [srcq + rowposq + 1 * mmsize]
+    movntdqa m2, [srcq + rowposq + 2 * mmsize]
+    movntdqa m3, [srcq + rowposq + 3 * mmsize]
+
+    mova [dstq + rowposq + 0 * mmsize], m0     ; store the same 64 bytes to dst
+    mova [dstq + rowposq + 1 * mmsize], m1
+    mova [dstq + rowposq + 2 * mmsize], m2
+    mova [dstq + rowposq + 3 * mmsize], m3
+
+    add rowposq, 4 * mmsize             ; advance 64 bytes
+    jnz .loop                           ; the row ends only when rowpos is exactly 0, so the width must be a non-zero multiple of 64
+
+    add srcq, src_linesizeq             ; next source row
+    add dstq, dst_linesizeq             ; next destination row
+    dec heightd
+    jnz .row_start                      ; tested after the row, so height must be at least 1
+
+    RET
diff --git a/media/ffvpx/libavutil/x86/imgutils_init.c b/media/ffvpx/libavutil/x86/imgutils_init.c
new file mode 100644
index 000000000..4ea398205
--- /dev/null
+++ b/media/ffvpx/libavutil/x86/imgutils_init.c
@@ -0,0 +1,49 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/cpu.h"
+#include "libavutil/error.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/imgutils_internal.h"
+#include "libavutil/internal.h"
+
+#include "cpu.h"
+
+void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize,
+                                      const uint8_t *src, ptrdiff_t src_linesize,
+                                      ptrdiff_t bytewidth, int height);
+
+int ff_image_copy_plane_uc_from_x86(uint8_t       *dst, ptrdiff_t dst_linesize,
+                                    const uint8_t *src, ptrdiff_t src_linesize,
+                                    ptrdiff_t bytewidth, int height)
+{
+    int cpu_flags = av_get_cpu_flags();
+    ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64); /* asm copies 64-byte groups */
+    /* Report ENOSYS (nothing copied) unless SSE4 is usable, the padded width
+     * fits both strides, and width and height are positive: the asm tests its
+     * counters only after each pass, so a zero count would overrun the buffers. */
+    if (!EXTERNAL_SSE4(cpu_flags) || bytewidth <= 0 || height <= 0 ||
+        bw_aligned > dst_linesize || bw_aligned > src_linesize)
+        return AVERROR(ENOSYS);
+    ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize,
+                                     bw_aligned, height);
+    return 0;
+}
diff --git a/media/ffvpx/libavutil/x86/moz.build b/media/ffvpx/libavutil/x86/moz.build
index 1d1a6ca67..b56ed75ea 100644
--- a/media/ffvpx/libavutil/x86/moz.build
+++ b/media/ffvpx/libavutil/x86/moz.build
@@ -12,6 +12,8 @@ SOURCES += [
     'fixed_dsp_init.c',
     'float_dsp.asm',
     'float_dsp_init.c',
+    'imgutils.asm',
+    'imgutils_init.c',
     'lls.asm',
     'lls_init.c'
 ]
diff --git a/media/ffvpx/libavutil/x86/x86inc.asm b/media/ffvpx/libavutil/x86/x86inc.asm
index b2e9c6019..6a054a3e0 100644
--- a/media/ffvpx/libavutil/x86/x86inc.asm
+++ b/media/ffvpx/libavutil/x86/x86inc.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2017 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
@@ -87,7 +87,9 @@
 ; keep supporting OS/2.
 %macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,aout
-        section .text
+        SECTION .text
+    %elifidn __OUTPUT_FORMAT__,coff
+        SECTION .text
     %else
         SECTION .rodata align=%1
     %endif
@@ -385,7 +387,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %ifnum %1
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
+                ; Reserve an additional register for storing the original stack pointer, but avoid using
+                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                 %assign regs_used (regs_used + 1)
+                %if ARCH_X86_64 && regs_used == 7
+                    %assign regs_used 8
+                %elif ARCH_X86_64 == 0 && regs_used == 1
+                    %assign regs_used 2
+                %endif
             %endif
             %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
@@ -419,10 +428,10 @@ DECLARE_REG 7,  rdi, 64
 DECLARE_REG 8,  rsi, 72
 DECLARE_REG 9,  rbx, 80
 DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
 
 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
@@ -468,41 +477,42 @@ DECLARE_REG 14, R15, 120
     WIN64_PUSH_XMM
 %endmacro
 
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
     %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
         %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
             mov rsp, rstkm
         %else
-            add %1, stack_size_padded
+            add rsp, stack_size_padded
             %assign %%pad_size stack_size_padded
         %endif
     %endif
     %if xmm_regs_used > 7
-        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
     %if xmm_regs_used > 6
-        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
 
-%macro WIN64_RESTORE_XMM 1
-    WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+    WIN64_RESTORE_XMM_INTERNAL
     %assign stack_offset (stack_offset-stack_size_padded)
+    %assign stack_size_padded 0
     %assign xmm_regs_used 0
 %endmacro
 
 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
-    WIN64_RESTORE_XMM_INTERNAL rsp
+    WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
     %if mmsize == 32
         vzeroupper
@@ -523,10 +533,10 @@ DECLARE_REG 7,  R10, 16
 DECLARE_REG 8,  R11, 24
 DECLARE_REG 9,  rbx, 32
 DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
 
 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
@@ -618,7 +628,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if WIN64 == 0
     %macro WIN64_SPILL_XMM 1
     %endmacro
-    %macro WIN64_RESTORE_XMM 1
+    %macro WIN64_RESTORE_XMM 0
     %endmacro
     %macro WIN64_PUSH_XMM 0
     %endmacro
@@ -629,7 +639,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; We can automatically detect "follows a branch", but not a branch target.
 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
-    %if has_epilogue
+    %if has_epilogue || cpuflag(ssse3)
         RET
     %else
         rep ret
@@ -780,25 +790,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
-%assign cpuflags_avx      (1<<11)| cpuflags_sse42
-%assign cpuflags_xop      (1<<12)| cpuflags_avx
-%assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_fma3     (1<<14)| cpuflags_avx
-%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<21)
-%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
-%assign cpuflags_aesni    (1<<24)|cpuflags_sse42
+%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
+%assign cpuflags_avx      (1<<13)| cpuflags_sse42
+%assign cpuflags_xop      (1<<14)| cpuflags_avx
+%assign cpuflags_fma4     (1<<15)| cpuflags_avx
+%assign cpuflags_fma3     (1<<16)| cpuflags_avx
+%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
+%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
+
+%assign cpuflags_cache32  (1<<20)
+%assign cpuflags_cache64  (1<<21)
+%assign cpuflags_slowctz  (1<<22)
+%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<24)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -1030,7 +1040,11 @@ INIT_XMM
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1 %+ SUFFIX, %1
+    %ifid %1
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        call %1
+    %endif
 %endmacro
 %macro call_internal 2
     %xdefine %%i %2
diff --git a/media/ffvpx/libavutil/x86/x86util.asm b/media/ffvpx/libavutil/x86/x86util.asm
index 44ed750ae..e1220dfc1 100644
--- a/media/ffvpx/libavutil/x86/x86util.asm
+++ b/media/ffvpx/libavutil/x86/x86util.asm
@@ -29,6 +29,21 @@
 
 %include "libavutil/x86/x86inc.asm"
 
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+    [base],           [base  + stride],   [base  + 2*stride], [base3], \
+    [base3 + stride], [base3 + 2*stride], [base3 + stride3],  [base3 + stride*4]
+
+; Interleave low src0 with low src1 and store in src0,
+; interleave high src0 with high src1 and store in src1.
+; %1 - types
+; %2 - index of the register with src0
+; %3 - index of the register with src1
+; %4 - index of the register for intermediate results
+; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3
+;                             src1: y0 y1 y2 y3 q0 q1 q2 q3
+;                     output: src0: x0 y0 x1 y1 x2 y2 x3 y3
+;                             src1: z0 q0 z1 q1 z2 q2 z3 q3
 %macro SBUTTERFLY 4
 %ifidn %1, dqqq
     vperm2i128  m%4, m%2, m%3, q0301
@@ -56,6 +71,12 @@
     SWAP %1, %3, %2
 %endmacro
 
+%macro SBUTTERFLYPD 3 ; a, b, tmp (register indices): a = low 64-bit halves of a,b; b = high 64-bit halves of a,b
+    movlhps m%3, m%1, m%2 ; tmp = low half of a, then low half of b
+    movhlps m%2, m%2, m%1 ; b   = high half of a, then high half of b
+    SWAP %1, %3           ; a now names the register that holds the tmp result
+%endmacro
+
 %macro TRANSPOSE4x4B 5
     SBUTTERFLY bw, %1, %2, %5
     SBUTTERFLY bw, %3, %4, %5
@@ -102,12 +123,9 @@
 %macro TRANSPOSE4x4PS 5
     SBUTTERFLYPS %1, %2, %5
     SBUTTERFLYPS %3, %4, %5
-    movlhps m%5, m%1, m%3
-    movhlps m%3, m%1
-    SWAP %5, %1
-    movlhps m%5, m%2, m%4
-    movhlps m%4, m%2
-    SWAP %5, %2, %3
+    SBUTTERFLYPD %1, %3, %5
+    SBUTTERFLYPD %2, %4, %5
+    SWAP %2, %3
 %endmacro
 
 %macro TRANSPOSE8x4D 9-11
@@ -260,6 +278,21 @@
     SWAP       %12, %15
 %endmacro
 
+%macro TRANSPOSE_8X8B 8 ; takes eight register indices
+    %if mmsize == 8
+        %error "This macro does not support mmsize == 8"
+    %endif
+    punpcklbw m%1, m%2 ; interleave the low bytes of each adjacent pair of inputs
+    punpcklbw m%3, m%4
+    punpcklbw m%5, m%6
+    punpcklbw m%7, m%8
+    TRANSPOSE4x4W %1, %3, %5, %7, %2 ; the 2nd input is passed as the last argument, presumably as scratch (confirm)
+    MOVHL m%2, m%1 ; each even-numbered register receives the high 64 bits of the one before it
+    MOVHL m%4, m%3
+    MOVHL m%6, m%5
+    MOVHL m%8, m%7
+%endmacro
+
 ; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
 %macro PABSW 2
 %if cpuflag(ssse3)
@@ -799,12 +832,25 @@
     pmaxsd  %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-    movss        %1, %2
-    shufps       %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vbroadcastss  %1, %2
+%elif cpuflag(avx)
+    %ifnum sizeof%2         ; avx1 register
+        shufps  xmm%1, xmm%2, xmm%2, q0000
+        %if sizeof%1 >= 32  ; mmsize>=32
+            vinsertf128  %1, %1, xmm%1, 1
+        %endif
+    %else                   ; avx1 memory
+        vbroadcastss  %1, %2
+    %endif
+%else
+    %ifnum sizeof%2         ; sse register
+        shufps  %1, %2, %2, q0000
+    %else                   ; sse memory
+        movss   %1, %2
+        shufps  %1, %1, 0
+    %endif
 %endif
 %endmacro
 
@@ -819,6 +865,21 @@
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm: replicate the low dword of src across dst
+%if cpuflag(avx2)
+    vpbroadcastd  %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+    %ifnum sizeof%2         ; sse2 register
+        pshufd  %1, %2, q0000 ; pick element 0 for all four positions
+    %else                   ; sse memory
+        movd    %1, %2        ; load the dword, then replicate it
+        pshufd  %1, %1, 0
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
     %rep 8
         %if %1>=0x80
@@ -871,3 +932,79 @@
     psrlq   %1, 8*(%2)
 %endif
 %endmacro
+
+%macro MOVHL 2 ; dst, src: low 64 bits of dst = high 64 bits of src (upper half of dst differs between paths)
+%ifidn %1, %2
+    punpckhqdq %1, %2
+%elif cpuflag(avx)
+    punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+    pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
+; Horizontal sum of packed single-precision floats.
+; The sum ends up in every element of dst/src; tmp is overwritten.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+    %if sizeof%1>=32  ; avx
+        vperm2f128  %2, %1, %1, (0)*16+(1) ; tmp = src with its two 128-bit halves swapped
+        addps       %1, %2
+    %endif
+    shufps      %2, %1, %1, q1032 ; tmp = src with the 64-bit halves of each lane swapped
+    addps       %1, %2
+    shufps      %2, %1, %1, q0321 ; tmp = src rotated by one element within each lane
+    addps       %1, %2
+%else  ; this form is a bit faster than the short avx-like emulation.
+    movaps      %2, %1            ; keep a copy, since the shuffle below works in place
+    shufps      %1, %1, q1032
+    addps       %1, %2
+    movaps      %2, %1
+    shufps      %1, %1, q0321
+    addps       %1, %2
+    ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; BLENDVPS dst/src_a, src_b, mask: blendvps, emulated when not available.
+; With AVX the mask may be any register. With SSE4.1 the instruction always
+; uses xmm0 as mask, so any other mask operand stops assembly with an error.
+; Otherwise a bitwise xor/and/xor sequence is used, which destroys src_b.
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+    %endif
+    blendvps  %1, %2, %3
+%else
+    xorps  %2, %1 ; src_b = bits that differ between src_a and src_b
+    andps  %2, %3 ; keep only the differing bits selected by mask
+    xorps  %1, %2 ; flip those bits in dst, i.e. take src_b where mask bits are set
+%endif
+%endmacro
+
+; PBLENDVB dst/src_a, src_b, mask: pblendvb, emulated when not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if notcpuflag(avx2) && sizeof%1 >= 32 ; avx is already known to be set here
+        %error pblendvb not possible with ymm on avx1, try blendvps.
+    %endif
+    pblendvb  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 pblendvb uses xmm0 as default 3rd operand, you used %3
+    %endif
+    pblendvb  %1, %2, %3
+%else
+    pxor  %2, %1 ; src_b = bits that differ between src_a and src_b
+    pand  %2, %3 ; keep only the differing bits selected by mask
+    pxor  %1, %2 ; flip those bits in dst, i.e. take src_b where mask bits are set
+%endif
+%endmacro