summaryrefslogtreecommitdiffstats
path: root/media/ffvpx/libavcodec/x86
diff options
context:
space:
mode:
authorjanekptacijarabaci <janekptacijarabaci@seznam.cz>2018-04-28 07:38:20 +0200
committerjanekptacijarabaci <janekptacijarabaci@seznam.cz>2018-04-28 07:38:20 +0200
commit6b2bba06b433fb75979ab6daf7bbe8bc76c53875 (patch)
tree75803b4683889e6e0d2a3faef30415db3ff7b3ba /media/ffvpx/libavcodec/x86
parent72def35cd0cf3649b6d7ab72b66117df3e1c33fc (diff)
parentc75dae3ed21bfa5a8ae46cd83d18329af5bea05a (diff)
downloadUXP-6b2bba06b433fb75979ab6daf7bbe8bc76c53875.tar
UXP-6b2bba06b433fb75979ab6daf7bbe8bc76c53875.tar.gz
UXP-6b2bba06b433fb75979ab6daf7bbe8bc76c53875.tar.lz
UXP-6b2bba06b433fb75979ab6daf7bbe8bc76c53875.tar.xz
UXP-6b2bba06b433fb75979ab6daf7bbe8bc76c53875.zip
Merge branch 'master' of https://github.com/MoonchildProductions/UXP into pm_url_1
Diffstat (limited to 'media/ffvpx/libavcodec/x86')
-rw-r--r--media/ffvpx/libavcodec/x86/constants.h1
-rw-r--r--media/ffvpx/libavcodec/x86/flacdsp_init.c4
-rw-r--r--media/ffvpx/libavcodec/x86/h264_i386.h212
-rw-r--r--media/ffvpx/libavcodec/x86/h264_intrapred.asm102
-rw-r--r--media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm77
-rw-r--r--media/ffvpx/libavcodec/x86/h264_intrapred_init.c7
-rw-r--r--media/ffvpx/libavcodec/x86/videodsp.asm2
-rw-r--r--media/ffvpx/libavcodec/x86/videodsp_init.c8
-rw-r--r--media/ffvpx/libavcodec/x86/vp8dsp.asm31
-rw-r--r--media/ffvpx/libavcodec/x86/vp8dsp_init.c21
-rw-r--r--media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm6
-rw-r--r--media/ffvpx/libavcodec/x86/vp9dsp_init.c16
-rw-r--r--media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c18
-rw-r--r--media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c8
-rw-r--r--media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm257
-rw-r--r--media/ffvpx/libavcodec/x86/vp9itxfm.asm188
-rw-r--r--media/ffvpx/libavcodec/x86/vp9mc.asm7
17 files changed, 632 insertions, 333 deletions
diff --git a/media/ffvpx/libavcodec/x86/constants.h b/media/ffvpx/libavcodec/x86/constants.h
index b82aef9a4..bbb0ef844 100644
--- a/media/ffvpx/libavcodec/x86/constants.h
+++ b/media/ffvpx/libavcodec/x86/constants.h
@@ -43,6 +43,7 @@ extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg ff_pw_255;
+extern const ymm_reg ff_pw_256;
extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
diff --git a/media/ffvpx/libavcodec/x86/flacdsp_init.c b/media/ffvpx/libavcodec/x86/flacdsp_init.c
index e28c5c932..1971f81b8 100644
--- a/media/ffvpx/libavcodec/x86/flacdsp_init.c
+++ b/media/ffvpx/libavcodec/x86/flacdsp_init.c
@@ -53,7 +53,7 @@ DECORRELATE_FUNCS(32, avx);
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if CONFIG_FLAC_DECODER
@@ -111,5 +111,5 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
}
#endif
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
diff --git a/media/ffvpx/libavcodec/x86/h264_i386.h b/media/ffvpx/libavcodec/x86/h264_i386.h
deleted file mode 100644
index 19cd12838..000000000
--- a/media/ffvpx/libavcodec/x86/h264_i386.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG-4 part10 codec.
- * non-MMX i386-specific optimizations for H.264
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_X86_H264_I386_H
-#define AVCODEC_X86_H264_I386_H
-
-#include <stddef.h>
-
-#include "libavcodec/cabac.h"
-#include "cabac.h"
-
-#if HAVE_INLINE_ASM
-
-#if ARCH_X86_64
-#define REG64 "r"
-#else
-#define REG64 "m"
-#endif
-
-//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
-//as that would make optimization work hard)
-#if HAVE_7REGS && !BROKEN_COMPILER
-#define decode_significance decode_significance_x86
-static int decode_significance_x86(CABACContext *c, int max_coeff,
- uint8_t *significant_coeff_ctx_base,
- int *index, x86_reg last_off){
- void *end= significant_coeff_ctx_base + max_coeff - 1;
- int minusstart= -(intptr_t)significant_coeff_ctx_base;
- int minusindex= 4-(intptr_t)index;
- int bit;
- x86_reg coeff_count;
-
-#ifdef BROKEN_RELOCATIONS
- void *tables;
-
- __asm__ volatile(
- "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
- : "=&r"(tables)
- : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
- );
-#endif
-
- __asm__ volatile(
- "3: \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c11(%6)", "%c12(%6)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%13")
-
- "test $1, %4 \n\t"
- " jz 4f \n\t"
- "add %10, %1 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c11(%6)", "%c12(%6)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%13")
-
- "sub %10, %1 \n\t"
- "mov %2, %0 \n\t"
- "movl %7, %%ecx \n\t"
- "add %1, %%"FF_REG_c" \n\t"
- "movl %%ecx, (%0) \n\t"
-
- "test $1, %4 \n\t"
- " jnz 5f \n\t"
-
- "add"FF_OPSIZE" $4, %2 \n\t"
-
- "4: \n\t"
- "add $1, %1 \n\t"
- "cmp %8, %1 \n\t"
- " jb 3b \n\t"
- "mov %2, %0 \n\t"
- "movl %7, %%ecx \n\t"
- "add %1, %%"FF_REG_c" \n\t"
- "movl %%ecx, (%0) \n\t"
- "5: \n\t"
- "add %9, %k0 \n\t"
- "shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
- : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end))
- TABLES_ARG
- : "%"FF_REG_c, "memory"
- );
- return coeff_count;
-}
-
-#define decode_significance_8x8 decode_significance_8x8_x86
-static int decode_significance_8x8_x86(CABACContext *c,
- uint8_t *significant_coeff_ctx_base,
- int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
- int minusindex= 4-(intptr_t)index;
- int bit;
- x86_reg coeff_count;
- x86_reg last=0;
- x86_reg state;
-
-#ifdef BROKEN_RELOCATIONS
- void *tables;
-
- __asm__ volatile(
- "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
- : "=&r"(tables)
- : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
- );
-#endif
-
- __asm__ volatile(
- "mov %1, %6 \n\t"
- "3: \n\t"
-
- "mov %10, %0 \n\t"
- "movzb (%0, %6), %6 \n\t"
- "add %9, %6 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c12(%7)", "%c13(%7)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%15")
-
- "mov %1, %6 \n\t"
- "test $1, %4 \n\t"
- " jz 4f \n\t"
-
-#ifdef BROKEN_RELOCATIONS
- "movzb %c14(%15, %q6), %6\n\t"
-#else
- "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
-#endif
- "add %11, %6 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c12(%7)", "%c13(%7)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%15")
-
- "mov %2, %0 \n\t"
- "mov %1, %6 \n\t"
- "mov %k6, (%0) \n\t"
-
- "test $1, %4 \n\t"
- " jnz 5f \n\t"
-
- "add"FF_OPSIZE" $4, %2 \n\t"
-
- "4: \n\t"
- "add $1, %6 \n\t"
- "mov %6, %1 \n\t"
- "cmp $63, %6 \n\t"
- " jb 3b \n\t"
- "mov %2, %0 \n\t"
- "mov %k6, (%0) \n\t"
- "5: \n\t"
- "addl %8, %k0 \n\t"
- "shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
- "=&r"(bit), "+&r"(c->range), "=&r"(state)
- : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
- REG64(sig_off), REG64(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end)),
- "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
- : "%"FF_REG_c, "memory"
- );
- return coeff_count;
-}
-#endif /* HAVE_7REGS && BROKEN_COMPILER */
-
-#endif /* HAVE_INLINE_ASM */
-#endif /* AVCODEC_X86_H264_I386_H */
diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred.asm b/media/ffvpx/libavcodec/x86/h264_intrapred.asm
index c88d91b49..f3aa3172f 100644
--- a/media/ffvpx/libavcodec/x86/h264_intrapred.asm
+++ b/media/ffvpx/libavcodec/x86/h264_intrapred.asm
@@ -49,7 +49,7 @@ cextern pw_17
cextern pw_32
;-----------------------------------------------------------------------------
-; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
+; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmx
@@ -85,7 +85,7 @@ cglobal pred16x16_vertical_8, 2,3
REP_RET
;-----------------------------------------------------------------------------
-; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
+; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_H 0
@@ -126,7 +126,7 @@ INIT_XMM ssse3
PRED16x16_H
;-----------------------------------------------------------------------------
-; void ff_pred16x16_dc_8(uint8_t *src, int stride)
+; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
@@ -188,7 +188,7 @@ INIT_XMM ssse3
PRED16x16_DC
;-----------------------------------------------------------------------------
-; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
+; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TM 0
@@ -268,8 +268,45 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
;-----------------------------------------------------------------------------
-; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
+; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro H264_PRED16x16_PLANE 1
@@ -550,7 +587,7 @@ H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
;-----------------------------------------------------------------------------
-; void ff_pred8x8_plane_8(uint8_t *src, int stride)
+; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro H264_PRED8x8_PLANE 0
@@ -724,7 +761,7 @@ INIT_XMM ssse3
H264_PRED8x8_PLANE
;-----------------------------------------------------------------------------
-; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
+; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmx
@@ -741,7 +778,7 @@ cglobal pred8x8_vertical_8, 2,2
RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
+; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 0
@@ -769,7 +806,7 @@ INIT_MMX ssse3
PRED8x8_H
;-----------------------------------------------------------------------------
-; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
+; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
@@ -803,7 +840,7 @@ cglobal pred8x8_top_dc_8, 2,5
RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
+; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -864,7 +901,7 @@ cglobal pred8x8_dc_8, 2,5
RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
+; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -901,7 +938,7 @@ cglobal pred8x8_dc_rv40_8, 2,7
REP_RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
+; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_TM 0
@@ -1014,7 +1051,7 @@ cglobal pred8x8_tm_vp8_8, 2,3,6
;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
@@ -1070,7 +1107,7 @@ PRED8x8L_TOP_DC
;-----------------------------------------------------------------------------
; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DC 0
@@ -1174,7 +1211,7 @@ PRED8x8L_DC
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
@@ -1246,7 +1283,7 @@ PRED8x8L_HORIZONTAL
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
@@ -1297,7 +1334,7 @@ PRED8x8L_VERTICAL
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -1498,7 +1535,7 @@ PRED8x8L_DOWN_LEFT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -1750,7 +1787,7 @@ PRED8x8L_DOWN_RIGHT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -1978,7 +2015,7 @@ PRED8x8L_VERTICAL_RIGHT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_LEFT 0
@@ -2068,7 +2105,7 @@ PRED8x8L_VERTICAL_LEFT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
@@ -2156,7 +2193,7 @@ PRED8x8L_HORIZONTAL_UP
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
-; int has_topright, int stride)
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2404,7 +2441,8 @@ INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
;-------------------------------------------------------------------------------
-; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
;-------------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2435,7 +2473,7 @@ cglobal pred4x4_dc_8, 3,5
;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_TM 0
@@ -2514,7 +2552,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2535,7 +2573,7 @@ cglobal pred4x4_vertical_vp8_8, 3,3
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
@@ -2562,7 +2600,7 @@ cglobal pred4x4_down_left_8, 3,3
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;------------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2588,7 +2626,7 @@ cglobal pred4x4_vertical_left_8, 3,3
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;------------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2622,7 +2660,8 @@ cglobal pred4x4_horizontal_up_8, 3,3
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
-; const uint8_t *topright, int stride)
+; const uint8_t *topright,
+; ptrdiff_t stride)
;------------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2658,7 +2697,8 @@ cglobal pred4x4_horizontal_down_8, 3,3
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
-; const uint8_t *topright, int stride)
+; const uint8_t *topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -2689,7 +2729,7 @@ cglobal pred4x4_vertical_right_8, 3,3
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
-; int stride)
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm
index 9e40cfe24..629e0a72e 100644
--- a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm
@@ -51,7 +51,8 @@ SECTION .text
%endmacro
;-----------------------------------------------------------------------------
-; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
@@ -89,7 +90,8 @@ PRED4x4_DR
%endif
;------------------------------------------------------------------------------
-; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
@@ -128,7 +130,8 @@ PRED4x4_VR
%endif
;-------------------------------------------------------------------------------
-; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
@@ -170,7 +173,7 @@ PRED4x4_HD
%endif
;-----------------------------------------------------------------------------
-; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
@@ -195,7 +198,8 @@ cglobal pred4x4_dc_10, 3, 3
RET
;-----------------------------------------------------------------------------
-; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
@@ -225,7 +229,8 @@ PRED4x4_DL
%endif
;-----------------------------------------------------------------------------
-; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
@@ -254,7 +259,8 @@ PRED4x4_VL
%endif
;-----------------------------------------------------------------------------
-; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
+; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
@@ -288,7 +294,7 @@ cglobal pred4x4_horizontal_up_10, 3, 3
;-----------------------------------------------------------------------------
-; void ff_pred8x8_vertical(pixel *src, int stride)
+; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
@@ -304,7 +310,7 @@ cglobal pred8x8_vertical_10, 2, 2
RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_horizontal(pixel *src, int stride)
+; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
@@ -324,7 +330,7 @@ cglobal pred8x8_horizontal_10, 2, 3
REP_RET
;-----------------------------------------------------------------------------
-; void ff_predict_8x8_dc(pixel *src, int stride)
+; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
@@ -411,7 +417,7 @@ INIT_XMM sse2
PRED8x8_DC pshuflw
;-----------------------------------------------------------------------------
-; void ff_pred8x8_top_dc(pixel *src, int stride)
+; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
@@ -438,7 +444,7 @@ cglobal pred8x8_top_dc_10, 2, 4
RET
;-----------------------------------------------------------------------------
-; void ff_pred8x8_plane(pixel *src, int stride)
+; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
@@ -501,8 +507,8 @@ cglobal pred8x8_plane_10, 2, 7, 7
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
@@ -526,8 +532,8 @@ INIT_XMM sse2
PRED8x8L_128_DC
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
@@ -566,7 +572,8 @@ PRED8x8L_TOP_DC
%endif
;-------------------------------------------------------------------------------
-; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
+; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
@@ -625,8 +632,8 @@ PRED8x8L_DC
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
@@ -661,8 +668,8 @@ PRED8x8L_VERTICAL
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
@@ -718,8 +725,8 @@ PRED8x8L_HORIZONTAL
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
@@ -787,8 +794,8 @@ PRED8x8L_DOWN_LEFT
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
-; int stride)
+; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
@@ -862,8 +869,8 @@ PRED8x8L_DOWN_RIGHT
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
-; int has_topright, int stride)
+; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
@@ -933,8 +940,8 @@ PRED8x8L_VERTICAL_RIGHT
%endif
;-----------------------------------------------------------------------------
-; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
-; int has_topright, int stride)
+; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
@@ -996,7 +1003,7 @@ PRED8x8L_HORIZONTAL_UP
;-----------------------------------------------------------------------------
-; void ff_pred16x16_vertical(pixel *src, int stride)
+; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
mova [%1+ 0], %2
@@ -1032,7 +1039,7 @@ INIT_XMM sse2
PRED16x16_VERTICAL
;-----------------------------------------------------------------------------
-; void ff_pred16x16_horizontal(pixel *src, int stride)
+; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
@@ -1056,7 +1063,7 @@ INIT_XMM sse2
PRED16x16_HORIZONTAL
;-----------------------------------------------------------------------------
-; void ff_pred16x16_dc(pixel *src, int stride)
+; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
@@ -1102,7 +1109,7 @@ INIT_XMM sse2
PRED16x16_DC
;-----------------------------------------------------------------------------
-; void ff_pred16x16_top_dc(pixel *src, int stride)
+; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
@@ -1134,7 +1141,7 @@ INIT_XMM sse2
PRED16x16_TOP_DC
;-----------------------------------------------------------------------------
-; void ff_pred16x16_left_dc(pixel *src, int stride)
+; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
@@ -1171,7 +1178,7 @@ INIT_XMM sse2
PRED16x16_LEFT_DC
;-----------------------------------------------------------------------------
-; void ff_pred16x16_128_dc(pixel *src, int stride)
+; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c
index 528b92e49..bdd5125d6 100644
--- a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c
+++ b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c
@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
+
+ if(EXTERNAL_AVX2(cpu_flags)){
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
+ }
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
diff --git a/media/ffvpx/libavcodec/x86/videodsp.asm b/media/ffvpx/libavcodec/x86/videodsp.asm
index a807d3b88..e23786070 100644
--- a/media/ffvpx/libavcodec/x86/videodsp.asm
+++ b/media/ffvpx/libavcodec/x86/videodsp.asm
@@ -114,7 +114,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
.x_loop: ; do {
movu [dstq+wq*2], m0 ; write($reg, $mmsize)
add wq, mmsize/2 ; w -= $mmsize/2
- cmp wq, -mmsize/2 ; } while (w > $mmsize/2)
+ cmp wq, -(mmsize/2) ; } while (w > $mmsize/2)
jl .x_loop
movu [dstq-mmsize], m0 ; write($reg, $mmsize)
add dstq, dst_strideq ; dst += dst_stride
diff --git a/media/ffvpx/libavcodec/x86/videodsp_init.c b/media/ffvpx/libavcodec/x86/videodsp_init.c
index 26e072bb1..eeebb4154 100644
--- a/media/ffvpx/libavcodec/x86/videodsp_init.c
+++ b/media/ffvpx/libavcodec/x86/videodsp_init.c
@@ -29,7 +29,7 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
@@ -271,14 +271,14 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
@@ -305,5 +305,5 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
ctx->emulated_edge_mc = emulated_edge_mc_avx2;
}
#endif
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
diff --git a/media/ffvpx/libavcodec/x86/vp8dsp.asm b/media/ffvpx/libavcodec/x86/vp8dsp.asm
index 538b3f4a9..e303b8029 100644
--- a/media/ffvpx/libavcodec/x86/vp8dsp.asm
+++ b/media/ffvpx/libavcodec/x86/vp8dsp.asm
@@ -156,8 +156,8 @@ SECTION .text
;-------------------------------------------------------------------------------
; subpel MC functions:
;
-; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
-; uint8_t *src, int srcstride,
+; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
+; uint8_t *src, ptrdiff_t srcstride,
; int height, int mx, int my);
;-------------------------------------------------------------------------------
@@ -884,7 +884,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
REP_RET
;-----------------------------------------------------------------------------
-; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
+; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------
%macro ADD_DC 4
@@ -906,6 +906,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
%4 [dst2q+strideq+%3], m5
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
@@ -929,8 +930,9 @@ cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
+%endif
-INIT_XMM sse4
+%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq]
@@ -956,13 +958,28 @@ cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
paddw m4, m0
packuswb m2, m4
movd [dst1q], m2
+%if cpuflag(sse4)
pextrd [dst1q+strideq], m2, 1
pextrd [dst2q], m2, 2
pextrd [dst2q+strideq], m2, 3
+%else
+ psrldq m2, 4
+ movd [dst1q+strideq], m2
+ psrldq m2, 4
+ movd [dst2q], m2
+ psrldq m2, 4
+ movd [dst2q+strideq], m2
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+VP8_IDCT_DC_ADD
+INIT_XMM sse4
+VP8_IDCT_DC_ADD
;-----------------------------------------------------------------------------
-; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
+; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
@@ -1035,7 +1052,7 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
RET
;-----------------------------------------------------------------------------
-; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
+; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
@@ -1077,7 +1094,7 @@ cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
RET
;-----------------------------------------------------------------------------
-; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
+; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_init.c b/media/ffvpx/libavcodec/x86/vp8dsp_init.c
index 897d5a0e7..397b2518c 100644
--- a/media/ffvpx/libavcodec/x86/vp8dsp_init.c
+++ b/media/ffvpx/libavcodec/x86/vp8dsp_init.c
@@ -26,7 +26,7 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
/*
* MC functions
@@ -233,6 +233,8 @@ HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
@@ -288,7 +290,7 @@ DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
@@ -316,7 +318,7 @@ DECLARE_LOOP_FILTER(sse4)
av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
@@ -346,7 +348,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
- if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -361,18 +363,18 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
}
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
@@ -416,7 +418,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
}
- if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -427,6 +429,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
@@ -460,5 +463,5 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm
index 98bb6696a..caeb40526 100644
--- a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -43,7 +43,7 @@ cextern pb_80
SECTION .text
;-----------------------------------------------------------------------------
-; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
+; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim);
;-----------------------------------------------------------------------------
; macro called with 7 mm register indexes as argument, and 4 regular registers
@@ -429,7 +429,7 @@ INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
-; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
+; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
@@ -921,7 +921,7 @@ INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
;-----------------------------------------------------------------------------
-; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
+; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.c b/media/ffvpx/libavcodec/x86/vp9dsp_init.c
index cc781a009..837cce850 100644
--- a/media/ffvpx/libavcodec/x86/vp9dsp_init.c
+++ b/media/ffvpx/libavcodec/x86/vp9dsp_init.c
@@ -27,7 +27,7 @@
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
decl_fpel_func(put, 4, , mmx);
decl_fpel_func(put, 8, , mmx);
@@ -114,7 +114,7 @@ itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
-itxfm_func(idct, idct, 16, avx2);
+itxfm_funcs(16, avx2);
itxfm_func(idct, idct, 32, avx2);
#undef itxfm_func
@@ -212,11 +212,11 @@ ipred_func(32, tm, avx2);
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags;
if (bpp == 10) {
@@ -391,6 +391,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
init_subpel3_32_64(0, put, 8, avx2);
init_subpel3_32_64(1, avg, 8, avx2);
@@ -406,5 +412,5 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
#undef init_subpel2
#undef init_subpel3
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c
index eb67499c9..60d10a12a 100644
--- a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -27,7 +27,7 @@
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
decl_fpel_func(put, 8, , mmx);
decl_fpel_func(avg, 8, _16, mmxext);
@@ -51,6 +51,10 @@ decl_ipred_fns(h, 16, mmxext, sse2);
decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl, 16, 16, avx2);
+decl_ipred_fn(dl, 32, 16, avx2);
+decl_ipred_fn(dr, 16, 16, avx2);
+decl_ipred_fn(dr, 32, 16, avx2);
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@@ -63,11 +67,11 @@ decl_ipred_dir_funcs(vl);
decl_ipred_dir_funcs(vr);
decl_ipred_dir_funcs(hu);
decl_ipred_dir_funcs(hd);
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
@@ -133,7 +137,13 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 1, 32, avg, _16, avx2);
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
+ init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
+#if ARCH_X86_64
+ init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
+#endif
}
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
}
diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 4840b2844..b56afc7f5 100644
--- a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -27,7 +27,7 @@
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
-#if HAVE_YASM
+#if HAVE_X86ASM
extern const int16_t ff_filters_16bpp[3][15][4][16];
@@ -137,11 +137,11 @@ decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
decl_itxfm_funcs(8, BPC, sse2);
decl_itxfm_funcs(16, BPC, sse2);
decl_itxfm_func(idct, idct, 32, BPC, sse2);
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
{
-#if HAVE_YASM
+#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
@@ -234,7 +234,7 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
#endif
}
-#endif /* HAVE_YASM */
+#endif /* HAVE_X86ASM */
ff_vp9dsp_init_16bpp_x86(dsp);
}
diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
index c0ac16d3e..32b698243 100644
--- a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -847,6 +847,108 @@ DL_FUNCS
INIT_XMM avx
DL_FUNCS
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
+ mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
+ vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
+ vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
+ vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
+ vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
+ LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
+ vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
+ vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
+ vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
+ LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
+ vperm2i128 m2, m1, m4, q0201 ; Z......555555555
+ vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq*0 + 0], m0
+ mova [dstq+strideq*0 +32], m1
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m2, m1, 2
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m2, m1, 4
+ mova [dstq+strideq*2 + 0], m3
+ mova [dstq+strideq*2 +32], m4
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m2, m1, 6
+ mova [dstq+stride3q*1+ 0], m3
+ mova [dstq+stride3q*1+32], m4
+ lea dstq, [dstq+strideq*4]
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m2, m1, 8
+ mova [dstq+strideq*0 + 0], m3
+ mova [dstq+strideq*0 +32], m4
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m2, m1, 10
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m2, m1, 12
+ mova [dstq+strideq*2+ 0], m3
+ mova [dstq+strideq*2+32], m4
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m2, m1, 14
+ mova [dstq+stride3q+ 0], m3
+ mova [dstq+stride3q+ 32], m4
+ vpalignr m3, m5, m0, 16
+ vpalignr m4, m2, m1, 16
+ vperm2i128 m5, m3, m4, q0201
+ vperm2i128 m2, m4, m4, q0101
+ mova m0, m3
+ mova m1, m4
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
movh m0, [lq] ; wxyz....
@@ -1068,6 +1170,161 @@ DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+ mova m0, [lq] ; klmnopqrstuvwxyz
+ movu m1, [aq-2] ; *abcdefghijklmno
+ mova m2, [aq] ; abcdefghijklmnop
+ vperm2i128 m4, m2, m2, q2001 ; ijklmnop........
+ vpalignr m5, m4, m2, 2 ; bcdefghijklmnop.
+ vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg
+ LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO.
+ vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz*
+ vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ#
+ vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH
+ DEFINE_ARGS dst, stride, stride3, stride5, dst3
+ lea dst3q, [dstq+strideq*4]
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m1, m5, 2
+ mova [dst3q+stride5q*2], m3 ; 14
+ mova [ dstq+stride3q*2], m4 ; 6
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m1, m5, 4
+ sub dst3q, strideq
+ mova [dst3q+stride5q*2], m3 ; 13
+ mova [dst3q+strideq*2 ], m4 ; 5
+ mova [dst3q+stride3q*4], m0 ; 15
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m1, m5, 6
+ mova [dstq+stride3q*4], m3 ; 12
+ mova [dst3q+strideq*1], m4 ; 4
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m1, m5, 8
+ mova [dst3q+strideq*8], m3 ; 11
+ mova [dst3q+strideq*0], m4 ; 3
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m1, m5, 10
+ mova [dstq+stride5q*2], m3 ; 10
+ mova [dstq+strideq*2 ], m4 ; 2
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m1, m5, 12
+ mova [dst3q+stride3q*2], m3 ; 9
+ mova [dstq+strideq*1 ], m4 ; 1
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m1, m5, 14
+ mova [dstq+strideq*8], m3 ; 8
+ mova [dstq+strideq*0], m4 ; 0
+ mova [dst3q+strideq*4], m5 ; 7
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
+ mova m0, [lq+mmsize*0+0] ; l[0-15]
+ mova m1, [lq+mmsize*1+0] ; l[16-31]
+ movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno
+ mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop
+ mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345
+ vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0
+ vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01
+ vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012
+ LOWPASS 0, 6, 7 ; L[0-15]
+ vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg
+ vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz*
+ vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 1, 5, 6 ; L[16-31]#
+ vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx
+ vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq
+ LOWPASS 2, 3, 6 ; A[0-15]
+ movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234
+ vperm2i128 m6, m4, m4, q2001 ; yz012345........
+ vpalignr m7, m6, m4, 2 ; rstuvwxyz012345.
+ LOWPASS 3, 4, 7 ; A[16-31].
+ vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH
+ vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23]
+ vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX
+ DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+ lea stride7q, [strideq*4+stride3q]
+ lea dst24q, [dst8q+stride3q*8]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 2
+
+.loop:
+ mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7
+ mova [dst24q+stride7q+32], m1
+ mova [dst8q+stride7q+0], m1
+ mova [dst8q+stride7q+32], m2
+ vpalignr m6, m4, m1, 2
+ vpalignr m7, m5, m0, 2
+ vpalignr m9, m8, m2, 2
+ mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
+ mova [dst24q+stride3q*2+32], m6
+ mova [dst8q+stride3q*2+0], m6
+ mova [dst8q+stride3q*2+32], m9
+ vpalignr m6, m4, m1, 4
+ vpalignr m7, m5, m0, 4
+ vpalignr m9, m8, m2, 4
+ mova [dst24q+stride5q+0], m7 ; 29 21 13 5
+ mova [dst24q+stride5q+32], m6
+ mova [dst8q+stride5q+0], m6
+ mova [dst8q+stride5q+32], m9
+ vpalignr m6, m4, m1, 6
+ vpalignr m7, m5, m0, 6
+ vpalignr m9, m8, m2, 6
+ mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4
+ mova [dst24q+strideq*4+32], m6
+ mova [dst8q+strideq*4+0], m6
+ mova [dst8q+strideq*4+32], m9
+ vpalignr m6, m4, m1, 8
+ vpalignr m7, m5, m0, 8
+ vpalignr m9, m8, m2, 8
+ mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3
+ mova [dst24q+stride3q+32], m6
+ mova [dst8q+stride3q+0], m6
+ mova [dst8q+stride3q+32], m9
+ vpalignr m6, m4, m1, 10
+ vpalignr m7, m5, m0, 10
+ vpalignr m9, m8, m2, 10
+ mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2
+ mova [dst24q+strideq*2+32], m6
+ mova [dst8q+strideq*2+0], m6
+ mova [dst8q+strideq*2+32], m9
+ vpalignr m6, m4, m1, 12
+ vpalignr m7, m5, m0, 12
+ vpalignr m9, m8, m2, 12
+ mova [dst24q+strideq+0 ], m7 ; 25 17 9 1
+ mova [dst24q+strideq+32], m6
+ mova [dst8q+strideq+0], m6
+ mova [dst8q+strideq+32], m9
+ vpalignr m6, m4, m1, 14
+ vpalignr m7, m5, m0, 14
+ vpalignr m9, m8, m2, 14
+ mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0
+ mova [dst24q+strideq*0+32], m6
+ mova [dst8q+strideq*0+0], m6
+ mova [dst8q+strideq*0+32], m9
+ mova m0, m5
+ mova m5, m1
+ mova m1, m4
+ mova m4, m2
+ mova m2, m8
+ mova m8, m3
+ sub dst24q, stride7q
+ sub dst24q, strideq
+ sub dst8q, stride7q
+ sub dst8q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endif
+
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp
diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm.asm b/media/ffvpx/libavcodec/x86/vp9itxfm.asm
index 57d6d353b..2c63fe514 100644
--- a/media/ffvpx/libavcodec/x86/vp9itxfm.asm
+++ b/media/ffvpx/libavcodec/x86/vp9itxfm.asm
@@ -1581,33 +1581,30 @@ cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
VP9_IDCT16_YMM_1D
mova [blockq+224], m7
- mova [blockq+480], m15
- pxor m15, m15
; store
- VP9_IDCT8_WRITEx2 0, 1, 6, 7, 15, [pw_512], 6
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 2, 3, 6, 7, 15, [pw_512], 6
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 4, 5, 6, 7, 15, [pw_512], 6
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
mova m6, [blockq+192]
mova m7, [blockq+224]
- SWAP 0, 15
- mova m15, [blockq+480]
- VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, [pw_512], 6
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 8, 9, 1, 2, 0, [pw_512], 6
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, [pw_512], 6
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, [pw_512], 6
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
- VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, [pw_512], 6
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
lea dstq, [dstq+2*strideq]
; at the end of the loop, m0 should still be zero
; use that to zero out block coefficients
+ pxor m0, m0
ZERO_BLOCK blockq, 32, 16, m0
RET
%endif
@@ -1987,6 +1984,173 @@ IADST16_FN idct, IDCT16, iadst, IADST16, avx
IADST16_FN iadst, IADST16, idct, IDCT16, avx
IADST16_FN iadst, IADST16, iadst, IADST16, avx
+; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
+; out: m[0-15] except m6, which is in [blockq+192]
+; uses blockq as scratch space
+%macro VP9_IADST16_YMM_1D 0
+ mova [blockq+ 32], m3
+ mova [blockq+ 64], m7
+ mova [blockq+ 96], m8
+
+ ; first half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d]
+ VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w]
+ VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d]
+ VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w]
+ VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w]
+
+ ; half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w]
+ VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w]
+
+ SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6
+ SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7
+
+ mova m0, [blockq+ 0]
+ mova m4, [blockq+128]
+ mova m3, [blockq+ 32]
+ mova m7, [blockq+ 64]
+ mova m8, [blockq+ 96]
+ mova [blockq+ 0], m1
+ mova [blockq+128], m14
+ mova [blockq+ 32], m6
+ mova [blockq+ 64], m9
+ mova [blockq+ 96], m10
+
+ ; second half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d]
+ VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w]
+ VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d]
+ VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w]
+ VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w]
+
+ ; second half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d]
+ VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w]
+ VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w]
+
+ SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4
+ SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5
+
+ mova m10, [blockq+ 96]
+ mova [blockq+ 96], m12
+
+ ; round 3
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w]
+ VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192]
+ PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d]
+ VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192]
+ PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w]
+
+ mova m1, [blockq+ 0]
+ mova m14, [blockq+128]
+ mova m6, [blockq+ 32]
+ mova m9, [blockq+ 64]
+ mova m12, [blockq+ 96]
+ mova [blockq+ 0], m10
+ mova [blockq+128], m5
+
+ SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a
+ SUMSUB_BA w, 1, 3, 5
+ PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a
+
+ SUMSUB_BA w, 9, 11, 5
+ PSIGNW m9, [pw_m1] ; m9=out1, m11=t10
+ SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6
+ mova m5, [blockq+128]
+ mova [blockq+192], m11
+ PSIGNW m15, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
+
+ PSIGNW m3, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
+ VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
+
+ mova m10, [blockq+ 0]
+
+ SWAP 0, 14, 6, 11, 8, 12, 10
+ SWAP 1, 9, 15, 4, 7, 3, 5
+ SWAP 5, 9, 15
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+%macro IADST16_YMM_FN 4
+INIT_YMM avx2
+cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_%2_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_%4_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_YMM_FN idct, IDCT16, iadst, IADST16
+IADST16_YMM_FN iadst, IADST16, idct, IDCT16
+IADST16_YMM_FN iadst, IADST16, iadst, IADST16
+%endif
+
;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------
diff --git a/media/ffvpx/libavcodec/x86/vp9mc.asm b/media/ffvpx/libavcodec/x86/vp9mc.asm
index 9152ba541..f64161b2c 100644
--- a/media/ffvpx/libavcodec/x86/vp9mc.asm
+++ b/media/ffvpx/libavcodec/x86/vp9mc.asm
@@ -1,5 +1,5 @@
;******************************************************************************
-;* VP9 MC SIMD optimizations
+;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
@@ -440,9 +440,8 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f
mova m10, [filteryq+96]
%endif
.loop:
- ; FIXME maybe reuse loads from previous rows, or just
- ; more generally unroll this to prevent multiple loads of
- ; the same data?
+ ; FIXME maybe reuse loads from previous rows, or just more generally
+ ; unroll this to prevent multiple loads of the same data?
movh m0, [srcq]
movh m1, [srcq+sstrideq]
movh m2, [srcq+sstrideq*2]