From 8b37a1bc306c1d5a3cc92e9dc04fb95d5d9a0298 Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 26 Apr 2018 16:49:15 -0500 Subject: [ffvpx] Update ffvp9/ffvp8 to 3.4.2-release Structure of code was slightly modified so that it should be no longer necessary to re-generate the config_*.h files, greatly simplifying the resync process in the future. --- media/ffvpx/libavcodec/x86/constants.h | 1 + media/ffvpx/libavcodec/x86/flacdsp_init.c | 4 +- media/ffvpx/libavcodec/x86/h264_i386.h | 212 ----------------- media/ffvpx/libavcodec/x86/h264_intrapred.asm | 102 +++++--- .../ffvpx/libavcodec/x86/h264_intrapred_10bit.asm | 77 +++--- media/ffvpx/libavcodec/x86/h264_intrapred_init.c | 7 + media/ffvpx/libavcodec/x86/videodsp.asm | 2 +- media/ffvpx/libavcodec/x86/videodsp_init.c | 8 +- media/ffvpx/libavcodec/x86/vp8dsp.asm | 31 ++- media/ffvpx/libavcodec/x86/vp8dsp_init.c | 21 +- media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm | 6 +- media/ffvpx/libavcodec/x86/vp9dsp_init.c | 16 +- media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c | 18 +- .../libavcodec/x86/vp9dsp_init_16bpp_template.c | 8 +- media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm | 257 +++++++++++++++++++++ media/ffvpx/libavcodec/x86/vp9itxfm.asm | 188 ++++++++++++++- media/ffvpx/libavcodec/x86/vp9mc.asm | 7 +- 17 files changed, 632 insertions(+), 333 deletions(-) delete mode 100644 media/ffvpx/libavcodec/x86/h264_i386.h (limited to 'media/ffvpx/libavcodec/x86') diff --git a/media/ffvpx/libavcodec/x86/constants.h b/media/ffvpx/libavcodec/x86/constants.h index b82aef9a4..bbb0ef844 100644 --- a/media/ffvpx/libavcodec/x86/constants.h +++ b/media/ffvpx/libavcodec/x86/constants.h @@ -43,6 +43,7 @@ extern const xmm_reg ff_pw_64; extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_128; extern const ymm_reg ff_pw_255; +extern const ymm_reg ff_pw_256; extern const ymm_reg ff_pw_512; extern const ymm_reg ff_pw_1023; extern const ymm_reg ff_pw_1024; diff --git a/media/ffvpx/libavcodec/x86/flacdsp_init.c b/media/ffvpx/libavcodec/x86/flacdsp_init.c index e28c5c932..1971f81b8 100644 --- a/media/ffvpx/libavcodec/x86/flacdsp_init.c +++ b/media/ffvpx/libavcodec/x86/flacdsp_init.c @@ -53,7 +53,7 @@ DECORRELATE_FUNCS(32, avx); av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); #if CONFIG_FLAC_DECODER @@ -111,5 +111,5 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int c->lpc16_encode = ff_flac_enc_lpc_16_sse4; } #endif -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } diff --git a/media/ffvpx/libavcodec/x86/h264_i386.h b/media/ffvpx/libavcodec/x86/h264_i386.h deleted file mode 100644 index 19cd12838..000000000 --- a/media/ffvpx/libavcodec/x86/h264_i386.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG-4 part10 codec. - * non-MMX i386-specific optimizations for H.264 - * @author Michael Niedermayer - */ - -#ifndef AVCODEC_X86_H264_I386_H -#define AVCODEC_X86_H264_I386_H - -#include - -#include "libavcodec/cabac.h" -#include "cabac.h" - -#if HAVE_INLINE_ASM - -#if ARCH_X86_64 -#define REG64 "r" -#else -#define REG64 "m" -#endif - -//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet -//as that would make optimization work hard) -#if HAVE_7REGS && !BROKEN_COMPILER -#define decode_significance decode_significance_x86 -static int decode_significance_x86(CABACContext *c, int max_coeff, - uint8_t *significant_coeff_ctx_base, - int *index, x86_reg last_off){ - void *end= significant_coeff_ctx_base + max_coeff - 1; - int minusstart= -(intptr_t)significant_coeff_ctx_base; - int minusindex= 4-(intptr_t)index; - int bit; - x86_reg coeff_count; - -#ifdef BROKEN_RELOCATIONS - void *tables; - - __asm__ volatile( - "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" - : "=&r"(tables) - : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables) - ); -#endif - - __asm__ volatile( - "3: \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c11(%6)", "%c12(%6)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%13") - - "test $1, %4 \n\t" - " jz 4f \n\t" - "add %10, %1 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c11(%6)", "%c12(%6)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%13") - - "sub %10, %1 \n\t" - "mov %2, %0 \n\t" - "movl %7, %%ecx \n\t" - "add %1, %%"FF_REG_c" \n\t" - "movl %%ecx, (%0) \n\t" - - "test $1, %4 \n\t" - " jnz 5f \n\t" - - "add"FF_OPSIZE" $4, %2 \n\t" - - "4: \n\t" - "add $1, %1 \n\t" - "cmp %8, %1 \n\t" - " jb 3b \n\t" - "mov %2, %0 \n\t" - "movl %7, %%ecx \n\t" - "add %1, %%"FF_REG_c" \n\t" - "movl %%ecx, (%0) \n\t" - "5: \n\t" - "add %9, %k0 \n\t" - "shr $2, %k0 \n\t" - : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index), - "+&r"(c->low), "=&r"(bit), "+&r"(c->range) - : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off), - "i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)) - TABLES_ARG - : "%"FF_REG_c, "memory" - ); - return coeff_count; -} - -#define decode_significance_8x8 decode_significance_8x8_x86 -static int decode_significance_8x8_x86(CABACContext *c, - uint8_t *significant_coeff_ctx_base, - int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){ - int minusindex= 4-(intptr_t)index; - int bit; - x86_reg coeff_count; - x86_reg last=0; - x86_reg state; - -#ifdef BROKEN_RELOCATIONS - void *tables; - - __asm__ volatile( - "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" - : "=&r"(tables) - : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables) - ); -#endif - - __asm__ volatile( - "mov %1, %6 \n\t" - "3: \n\t" - - "mov %10, %0 \n\t" - "movzb (%0, %6), %6 \n\t" - "add %9, %6 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c12(%7)", "%c13(%7)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%15") - - "mov %1, %6 \n\t" - "test $1, %4 \n\t" - " jz 4f \n\t" - -#ifdef BROKEN_RELOCATIONS - "movzb %c14(%15, %q6), %6\n\t" -#else - "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t" -#endif - "add %11, %6 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c12(%7)", "%c13(%7)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%15") - - "mov %2, %0 \n\t" - "mov %1, %6 \n\t" - "mov %k6, (%0) \n\t" - - "test $1, %4 \n\t" - " jnz 5f \n\t" - - "add"FF_OPSIZE" $4, %2 \n\t" - - "4: \n\t" - "add $1, %6 \n\t" - "mov %6, %1 \n\t" - "cmp $63, %6 \n\t" - " jb 3b \n\t" - "mov %2, %0 \n\t" - "mov %k6, (%0) \n\t" - "5: \n\t" - "addl %8, %k0 \n\t" - "shr $2, %k0 \n\t" - : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low), - "=&r"(bit), "+&r"(c->range), "=&r"(state) - : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), - REG64(sig_off), REG64(last_coeff_ctx_base), - "i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)), - "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG - : "%"FF_REG_c, "memory" - ); - return coeff_count; -} -#endif /* HAVE_7REGS && BROKEN_COMPILER */ - -#endif /* HAVE_INLINE_ASM */ -#endif /* AVCODEC_X86_H264_I386_H */ diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred.asm b/media/ffvpx/libavcodec/x86/h264_intrapred.asm index c88d91b49..f3aa3172f 100644 --- a/media/ffvpx/libavcodec/x86/h264_intrapred.asm +++ b/media/ffvpx/libavcodec/x86/h264_intrapred.asm @@ -49,7 +49,7 @@ cextern pw_17 cextern pw_32 ;----------------------------------------------------------------------------- -; void ff_pred16x16_vertical_8(uint8_t *src, int stride) +; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmx @@ -85,7 +85,7 @@ cglobal pred16x16_vertical_8, 2,3 REP_RET ;----------------------------------------------------------------------------- -; void ff_pred16x16_horizontal_8(uint8_t *src, int stride) +; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_H 0 @@ -126,7 +126,7 @@ INIT_XMM ssse3 PRED16x16_H ;----------------------------------------------------------------------------- -; void ff_pred16x16_dc_8(uint8_t *src, int stride) +; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_DC 0 @@ -188,7 +188,7 @@ INIT_XMM ssse3 PRED16x16_DC ;----------------------------------------------------------------------------- -; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride) +; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_TM 0 @@ -268,8 +268,45 @@ cglobal pred16x16_tm_vp8_8, 2,6,6 jg .loop REP_RET +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration + sub dstq, strideq + pmovzxbw m0, [dstq] + vpbroadcastb xm1, [r0-1] + pmovzxbw m1, xm1 + psubw m0, m1 + mov iterationd, 4 + lea stride3q, [strideq*3] +.loop: + vpbroadcastb xm1, [dstq+strideq*1-1] + vpbroadcastb xm2, [dstq+strideq*2-1] + vpbroadcastb xm3, [dstq+stride3q-1] + vpbroadcastb xm4, [dstq+strideq*4-1] + pmovzxbw m1, xm1 + pmovzxbw m2, xm2 + pmovzxbw m3, xm3 + pmovzxbw m4, xm4 + paddw m1, m0 + paddw m2, m0 + paddw m3, m0 + paddw m4, m0 + vpackuswb m1, m1, m2 + vpackuswb m3, m3, m4 + vpermq m1, m1, q3120 + vpermq m3, m3, q3120 + movdqa [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m1, 1 + movdqa [dstq+stride3q*1], xm3 + vextracti128 [dstq+strideq*4], m3, 1 + lea dstq, [dstq+strideq*4] + dec iterationd + jg .loop + REP_RET +%endif + ;----------------------------------------------------------------------------- -; void ff_pred16x16_plane_*_8(uint8_t *src, int stride) +; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro H264_PRED16x16_PLANE 1 @@ -550,7 +587,7 @@ H264_PRED16x16_PLANE rv40 H264_PRED16x16_PLANE svq3 ;----------------------------------------------------------------------------- -; void ff_pred8x8_plane_8(uint8_t *src, int stride) +; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro H264_PRED8x8_PLANE 0 @@ -724,7 +761,7 @@ INIT_XMM ssse3 H264_PRED8x8_PLANE ;----------------------------------------------------------------------------- -; void ff_pred8x8_vertical_8(uint8_t *src, int stride) +; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmx @@ -741,7 +778,7 @@ cglobal pred8x8_vertical_8, 2,2 RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_horizontal_8(uint8_t *src, int stride) +; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8_H 0 @@ -769,7 +806,7 @@ INIT_MMX ssse3 PRED8x8_H ;----------------------------------------------------------------------------- -; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride) +; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext cglobal pred8x8_top_dc_8, 2,5 @@ -803,7 +840,7 @@ cglobal pred8x8_top_dc_8, 2,5 RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride) +; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -864,7 +901,7 @@ cglobal pred8x8_dc_8, 2,5 RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride) +; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -901,7 +938,7 @@ cglobal pred8x8_dc_rv40_8, 2,7 REP_RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride) +; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8_TM 0 @@ -1014,7 +1051,7 @@ cglobal pred8x8_tm_vp8_8, 2,3,6 ;----------------------------------------------------------------------------- ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_TOP_DC 0 cglobal pred8x8l_top_dc_8, 4,4 @@ -1070,7 +1107,7 @@ PRED8x8L_TOP_DC ;----------------------------------------------------------------------------- ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DC 0 @@ -1174,7 +1211,7 @@ PRED8x8L_DC ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL 0 @@ -1246,7 +1283,7 @@ PRED8x8L_HORIZONTAL ;----------------------------------------------------------------------------- ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL 0 @@ -1297,7 +1334,7 @@ PRED8x8L_VERTICAL ;----------------------------------------------------------------------------- ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -1498,7 +1535,7 @@ PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -1750,7 +1787,7 @@ PRED8x8L_DOWN_RIGHT ;----------------------------------------------------------------------------- ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -1978,7 +2015,7 @@ PRED8x8L_VERTICAL_RIGHT ;----------------------------------------------------------------------------- ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL_LEFT 0 @@ -2068,7 +2105,7 @@ PRED8x8L_VERTICAL_LEFT ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL_UP 0 @@ -2156,7 +2193,7 @@ PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, -; int has_topright, int stride) +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -2404,7 +2441,8 @@ INIT_MMX ssse3 PRED8x8L_HORIZONTAL_DOWN ;------------------------------------------------------------------------------- -; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) +; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, +; ptrdiff_t stride) ;------------------------------------------------------------------------------- INIT_MMX mmxext @@ -2435,7 +2473,7 @@ cglobal pred4x4_dc_8, 3,5 ;----------------------------------------------------------------------------- ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED4x4_TM 0 @@ -2514,7 +2552,7 @@ cglobal pred4x4_tm_vp8_8, 3,3 ;----------------------------------------------------------------------------- ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -2535,7 +2573,7 @@ cglobal pred4x4_vertical_vp8_8, 3,3 ;----------------------------------------------------------------------------- ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext cglobal pred4x4_down_left_8, 3,3 @@ -2562,7 +2600,7 @@ cglobal pred4x4_down_left_8, 3,3 ;------------------------------------------------------------------------------ ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;------------------------------------------------------------------------------ INIT_MMX mmxext @@ -2588,7 +2626,7 @@ cglobal pred4x4_vertical_left_8, 3,3 ;------------------------------------------------------------------------------ ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;------------------------------------------------------------------------------ INIT_MMX mmxext @@ -2622,7 +2660,8 @@ cglobal pred4x4_horizontal_up_8, 3,3 ;------------------------------------------------------------------------------ ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src, -; const uint8_t *topright, int stride) +; const uint8_t *topright, +; ptrdiff_t stride) ;------------------------------------------------------------------------------ INIT_MMX mmxext @@ -2658,7 +2697,8 @@ cglobal pred4x4_horizontal_down_8, 3,3 ;----------------------------------------------------------------------------- ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src, -; const uint8_t *topright, int stride) +; const uint8_t *topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -2689,7 +2729,7 @@ cglobal pred4x4_vertical_right_8, 3,3 ;----------------------------------------------------------------------------- ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, -; int stride) +; ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm index 9e40cfe24..629e0a72e 100644 --- a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm @@ -51,7 +51,8 @@ SECTION .text %endmacro ;----------------------------------------------------------------------------- -; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED4x4_DR 0 cglobal pred4x4_down_right_10, 3, 3 @@ -89,7 +90,8 @@ PRED4x4_DR %endif ;------------------------------------------------------------------------------ -; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;------------------------------------------------------------------------------ %macro PRED4x4_VR 0 cglobal pred4x4_vertical_right_10, 3, 3, 6 @@ -128,7 +130,8 @@ PRED4x4_VR %endif ;------------------------------------------------------------------------------- -; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;------------------------------------------------------------------------------- %macro PRED4x4_HD 0 cglobal pred4x4_horizontal_down_10, 3, 3 @@ -170,7 +173,7 @@ PRED4x4_HD %endif ;----------------------------------------------------------------------------- -; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext @@ -195,7 +198,8 @@ cglobal pred4x4_dc_10, 3, 3 RET ;----------------------------------------------------------------------------- -; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED4x4_DL 0 cglobal pred4x4_down_left_10, 3, 3 @@ -225,7 +229,8 @@ PRED4x4_DL %endif ;----------------------------------------------------------------------------- -; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED4x4_VL 0 cglobal pred4x4_vertical_left_10, 3, 3 @@ -254,7 +259,8 @@ PRED4x4_VL %endif ;----------------------------------------------------------------------------- -; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride) +; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_MMX mmxext cglobal pred4x4_horizontal_up_10, 3, 3 @@ -288,7 +294,7 @@ cglobal pred4x4_horizontal_up_10, 3, 3 ;----------------------------------------------------------------------------- -; void ff_pred8x8_vertical(pixel *src, int stride) +; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal pred8x8_vertical_10, 2, 2 @@ -304,7 +310,7 @@ cglobal pred8x8_vertical_10, 2, 2 RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_horizontal(pixel *src, int stride) +; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal pred8x8_horizontal_10, 2, 3 @@ -324,7 +330,7 @@ cglobal pred8x8_horizontal_10, 2, 3 REP_RET ;----------------------------------------------------------------------------- -; void ff_predict_8x8_dc(pixel *src, int stride) +; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro MOV8 2-3 ; sort of a hack, but it works @@ -411,7 +417,7 @@ INIT_XMM sse2 PRED8x8_DC pshuflw ;----------------------------------------------------------------------------- -; void ff_pred8x8_top_dc(pixel *src, int stride) +; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal pred8x8_top_dc_10, 2, 4 @@ -438,7 +444,7 @@ cglobal pred8x8_top_dc_10, 2, 4 RET ;----------------------------------------------------------------------------- -; void ff_pred8x8_plane(pixel *src, int stride) +; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal pred8x8_plane_10, 2, 7, 7 @@ -501,8 +507,8 @@ cglobal pred8x8_plane_10, 2, 7, 7 ;----------------------------------------------------------------------------- -; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_128_DC 0 cglobal pred8x8l_128_dc_10, 4, 4 @@ -526,8 +532,8 @@ INIT_XMM sse2 PRED8x8L_128_DC ;----------------------------------------------------------------------------- -; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_TOP_DC 0 cglobal pred8x8l_top_dc_10, 4, 4, 6 @@ -566,7 +572,8 @@ PRED8x8L_TOP_DC %endif ;------------------------------------------------------------------------------- -; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) +; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) ;------------------------------------------------------------------------------- ;TODO: see if scalar is faster %macro PRED8x8L_DC 0 @@ -625,8 +632,8 @@ PRED8x8L_DC %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL 0 cglobal pred8x8l_vertical_10, 4, 4, 6 @@ -661,8 +668,8 @@ PRED8x8L_VERTICAL %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL 0 cglobal pred8x8l_horizontal_10, 4, 4, 5 @@ -718,8 +725,8 @@ PRED8x8L_HORIZONTAL %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_LEFT 0 cglobal pred8x8l_down_left_10, 4, 4, 7 @@ -787,8 +794,8 @@ PRED8x8L_DOWN_LEFT %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, -; int stride) +; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_RIGHT 0 ; standard forbids this when has_topleft is false @@ -862,8 +869,8 @@ PRED8x8L_DOWN_RIGHT %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft, -; int has_topright, int stride) +; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL_RIGHT 0 ; likewise with 8x8l_down_right @@ -933,8 +940,8 @@ PRED8x8L_VERTICAL_RIGHT %endif ;----------------------------------------------------------------------------- -; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft, -; int has_topright, int stride) +; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL_UP 0 cglobal pred8x8l_horizontal_up_10, 4, 4, 6 @@ -996,7 +1003,7 @@ PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- -; void ff_pred16x16_vertical(pixel *src, int stride) +; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro MOV16 3-5 mova [%1+ 0], %2 @@ -1032,7 +1039,7 @@ INIT_XMM sse2 PRED16x16_VERTICAL ;----------------------------------------------------------------------------- -; void ff_pred16x16_horizontal(pixel *src, int stride) +; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_HORIZONTAL 0 cglobal pred16x16_horizontal_10, 2, 3 @@ -1056,7 +1063,7 @@ INIT_XMM sse2 PRED16x16_HORIZONTAL ;----------------------------------------------------------------------------- -; void ff_pred16x16_dc(pixel *src, int stride) +; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_DC 0 cglobal pred16x16_dc_10, 2, 6 @@ -1102,7 +1109,7 @@ INIT_XMM sse2 PRED16x16_DC ;----------------------------------------------------------------------------- -; void ff_pred16x16_top_dc(pixel *src, int stride) +; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_TOP_DC 0 cglobal pred16x16_top_dc_10, 2, 3 @@ -1134,7 +1141,7 @@ INIT_XMM sse2 PRED16x16_TOP_DC ;----------------------------------------------------------------------------- -; void ff_pred16x16_left_dc(pixel *src, int stride) +; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_LEFT_DC 0 cglobal pred16x16_left_dc_10, 2, 6 @@ -1171,7 +1178,7 @@ INIT_XMM sse2 PRED16x16_LEFT_DC ;----------------------------------------------------------------------------- -; void ff_pred16x16_128_dc(pixel *src, int stride) +; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride) ;----------------------------------------------------------------------------- %macro PRED16x16_128_DC 0 cglobal pred16x16_128_dc_10, 2,3 diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c index 528b92e49..bdd5125d6 100644 --- a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c @@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3) PRED16x16(tm_vp8, 8, mmx) PRED16x16(tm_vp8, 8, mmxext) PRED16x16(tm_vp8, 8, sse2) +PRED16x16(tm_vp8, 8, avx2) PRED8x8(top_dc, 8, mmxext) PRED8x8(dc_rv40, 8, mmxext) @@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } } } + + if(EXTERNAL_AVX2(cpu_flags)){ + if (codec_id == AV_CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2; + } + } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; diff --git a/media/ffvpx/libavcodec/x86/videodsp.asm b/media/ffvpx/libavcodec/x86/videodsp.asm index a807d3b88..e23786070 100644 --- a/media/ffvpx/libavcodec/x86/videodsp.asm +++ b/media/ffvpx/libavcodec/x86/videodsp.asm @@ -114,7 +114,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 - cmp wq, -mmsize/2 ; } while (w > $mmsize/2) + cmp wq, -(mmsize/2) ; } while (w > $mmsize/2) jl .x_loop movu [dstq-mmsize], m0 ; write($reg, $mmsize) add dstq, dst_strideq ; dst += dst_stride diff --git a/media/ffvpx/libavcodec/x86/videodsp_init.c b/media/ffvpx/libavcodec/x86/videodsp_init.c index 26e072bb1..eeebb4154 100644 --- a/media/ffvpx/libavcodec/x86/videodsp_init.c +++ b/media/ffvpx/libavcodec/x86/videodsp_init.c @@ -29,7 +29,7 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/videodsp.h" -#if HAVE_YASM +#if HAVE_X86ASM typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, const uint8_t *src, x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh); @@ -271,14 +271,14 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, hfixtbl_avx2, &ff_emu_edge_hvar_avx2); } #endif /* HAVE_AVX2_EXTERNAL */ -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h); av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 @@ -305,5 +305,5 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) ctx->emulated_edge_mc = emulated_edge_mc_avx2; } #endif -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } diff --git a/media/ffvpx/libavcodec/x86/vp8dsp.asm b/media/ffvpx/libavcodec/x86/vp8dsp.asm index 538b3f4a9..e303b8029 100644 --- a/media/ffvpx/libavcodec/x86/vp8dsp.asm +++ b/media/ffvpx/libavcodec/x86/vp8dsp.asm @@ -156,8 +156,8 @@ SECTION .text ;------------------------------------------------------------------------------- ; subpel MC functions: ; -; void ff_put_vp8_epel_hv_(uint8_t *dst, int deststride, -; uint8_t *src, int srcstride, +; void ff_put_vp8_epel_hv_(uint8_t *dst, ptrdiff_t deststride, +; uint8_t *src, ptrdiff_t srcstride, ; int height, int mx, int my); ;------------------------------------------------------------------------------- @@ -884,7 +884,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height REP_RET ;----------------------------------------------------------------------------- -; void ff_vp8_idct_dc_add_(uint8_t *dst, int16_t block[16], int stride); +; void ff_vp8_idct_dc_add_(uint8_t *dst, int16_t block[16], ptrdiff_t stride); ;----------------------------------------------------------------------------- %macro ADD_DC 4 @@ -906,6 +906,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height %4 [dst2q+strideq+%3], m5 %endmacro +%if ARCH_X86_32 INIT_MMX mmx cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride ; load data @@ -929,8 +930,9 @@ cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride lea dst2q, [dst1q+strideq*2] ADD_DC m0, m1, 0, movh RET +%endif -INIT_XMM sse4 +%macro VP8_IDCT_DC_ADD 0 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride ; load data movd m0, [blockq] @@ -956,13 +958,28 @@ cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride paddw m4, m0 packuswb m2, m4 movd [dst1q], m2 +%if cpuflag(sse4) pextrd [dst1q+strideq], m2, 1 pextrd [dst2q], m2, 2 pextrd [dst2q+strideq], m2, 3 +%else + psrldq m2, 4 + movd [dst1q+strideq], m2 + psrldq m2, 4 + movd [dst2q], m2 + psrldq m2, 4 + movd [dst2q+strideq], m2 +%endif RET +%endmacro + +INIT_XMM sse2 +VP8_IDCT_DC_ADD +INIT_XMM sse4 +VP8_IDCT_DC_ADD ;----------------------------------------------------------------------------- -; void ff_vp8_idct_dc_add4y_(uint8_t *dst, int16_t block[4][16], int stride); +; void ff_vp8_idct_dc_add4y_(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); ;----------------------------------------------------------------------------- %if ARCH_X86_32 @@ -1035,7 +1052,7 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride RET ;----------------------------------------------------------------------------- -; void ff_vp8_idct_dc_add4uv_(uint8_t *dst, int16_t block[4][16], int stride); +; void ff_vp8_idct_dc_add4uv_(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); ;----------------------------------------------------------------------------- INIT_MMX mmx @@ -1077,7 +1094,7 @@ cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride RET ;----------------------------------------------------------------------------- -; void ff_vp8_idct_add_(uint8_t *dst, int16_t block[16], int stride); +; void ff_vp8_idct_add_(uint8_t *dst, int16_t block[16], ptrdiff_t stride); ;----------------------------------------------------------------------------- ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_init.c b/media/ffvpx/libavcodec/x86/vp8dsp_init.c index 897d5a0e7..397b2518c 100644 --- a/media/ffvpx/libavcodec/x86/vp8dsp_init.c +++ b/media/ffvpx/libavcodec/x86/vp8dsp_init.c @@ -26,7 +26,7 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/vp8dsp.h" -#if HAVE_YASM +#if HAVE_X86ASM /* * MC functions @@ -233,6 +233,8 @@ HVBILIN(ssse3, 8, 16, 16) void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], @@ -288,7 +290,7 @@ DECLARE_LOOP_FILTER(sse2) DECLARE_LOOP_FILTER(ssse3) DECLARE_LOOP_FILTER(sse4) -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ @@ -316,7 +318,7 @@ DECLARE_LOOP_FILTER(sse4) av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { @@ -346,7 +348,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { + if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); @@ -361,18 +363,18 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) VP8_BILINEAR_MC_FUNC(1, 8, ssse3); VP8_BILINEAR_MC_FUNC(2, 4, ssse3); } -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; #if ARCH_X86_32 + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx; c->vp8_idct_add = ff_vp8_idct_add_mmx; c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; @@ -416,7 +418,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; } - if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { + if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) { c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; @@ -427,6 +429,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) } if (EXTERNAL_SSE2(cpu_flags)) { + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; @@ -460,5 +463,5 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; } -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm index 98bb6696a..caeb40526 100644 --- a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm +++ b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm @@ -43,7 +43,7 @@ cextern pb_80 SECTION .text ;----------------------------------------------------------------------------- -; void ff_vp8_h/v_loop_filter_simple_(uint8_t *dst, int stride, int flim); +; void ff_vp8_h/v_loop_filter_simple_(uint8_t *dst, ptrdiff_t stride, int flim); ;----------------------------------------------------------------------------- ; macro called with 7 mm register indexes as argument, and 4 regular registers @@ -429,7 +429,7 @@ INIT_XMM sse4 SIMPLE_LOOPFILTER h, 5 ;----------------------------------------------------------------------------- -; void ff_vp8_h/v_loop_filter_inner_(uint8_t *dst, [uint8_t *v,] int stride, +; void ff_vp8_h/v_loop_filter_inner_(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- @@ -921,7 +921,7 @@ INNER_LOOPFILTER v, 8 INNER_LOOPFILTER h, 8 ;----------------------------------------------------------------------------- -; void ff_vp8_h/v_loop_filter_mbedge_(uint8_t *dst, [uint8_t *v,] int stride, +; void ff_vp8_h/v_loop_filter_mbedge_(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.c b/media/ffvpx/libavcodec/x86/vp9dsp_init.c index cc781a009..837cce850 100644 --- a/media/ffvpx/libavcodec/x86/vp9dsp_init.c +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init.c @@ -27,7 +27,7 @@ #include "libavcodec/vp9dsp.h" #include "libavcodec/x86/vp9dsp_init.h" -#if HAVE_YASM +#if HAVE_X86ASM decl_fpel_func(put, 4, , mmx); decl_fpel_func(put, 8, , mmx); @@ -114,7 +114,7 @@ itxfm_func(idct, idct, 32, sse2); itxfm_func(idct, idct, 32, ssse3); itxfm_func(idct, idct, 32, avx); itxfm_func(iwht, iwht, 4, mmx); -itxfm_func(idct, idct, 16, avx2); +itxfm_funcs(16, avx2); itxfm_func(idct, idct, 32, avx2); #undef itxfm_func @@ -212,11 +212,11 @@ ipred_func(32, tm, avx2); #undef ipred_dir_tm_funcs #undef ipred_dc_funcs -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags; if (bpp == 10) { @@ -391,6 +391,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) if (ARCH_X86_64) { #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; init_subpel3_32_64(0, put, 8, avx2); init_subpel3_32_64(1, avg, 8, avx2); @@ -406,5 +412,5 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) #undef init_subpel2 #undef init_subpel3 -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c index eb67499c9..60d10a12a 100644 --- a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c @@ -27,7 +27,7 @@ #include "libavcodec/vp9dsp.h" #include "libavcodec/x86/vp9dsp_init.h" -#if HAVE_YASM +#if HAVE_X86ASM decl_fpel_func(put, 8, , mmx); decl_fpel_func(avg, 8, _16, mmxext); @@ -51,6 +51,10 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); +decl_ipred_fn(dr, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -63,11 +67,11 @@ decl_ipred_dir_funcs(vl); decl_ipred_dir_funcs(vr); decl_ipred_dir_funcs(hu); decl_ipred_dir_funcs(hd); -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { @@ -133,7 +137,13 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 1, 32, avg, _16, avx2); init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); +#if ARCH_X86_64 + init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); +#endif } -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ } diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c index 4840b2844..b56afc7f5 100644 --- a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -27,7 +27,7 @@ #include "libavcodec/vp9dsp.h" #include "libavcodec/x86/vp9dsp_init.h" -#if HAVE_YASM +#if HAVE_X86ASM extern const int16_t ff_filters_16bpp[3][15][4][16]; @@ -137,11 +137,11 @@ decl_itxfm_func(iadst, iadst, 4, BPC, sse2); decl_itxfm_funcs(8, BPC, sse2); decl_itxfm_funcs(16, BPC, sse2); decl_itxfm_func(idct, idct, 32, BPC, sse2); -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) { -#if HAVE_YASM +#if HAVE_X86ASM int cpu_flags = av_get_cpu_flags(); #define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \ @@ -234,7 +234,7 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) #endif } -#endif /* HAVE_YASM */ +#endif /* HAVE_X86ASM */ ff_vp9dsp_init_16bpp_x86(dsp); } diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm index c0ac16d3e..32b698243 100644 --- a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm @@ -847,6 +847,108 @@ DL_FUNCS INIT_XMM avx DL_FUNCS +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] + +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova [dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a movh m0, [lq] ; wxyz.... @@ -1068,6 +1170,161 @@ DR_FUNCS 2 INIT_XMM avx DR_FUNCS 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a + mova m0, [lq] ; klmnopqrstuvwxyz + movu m1, [aq-2] ; *abcdefghijklmno + mova m2, [aq] ; abcdefghijklmnop + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. + vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH + DEFINE_ARGS dst, stride, stride3, stride5, dst3 + lea dst3q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + vpalignr m3, m5, m0, 2 + vpalignr m4, m1, m5, 2 + mova [dst3q+stride5q*2], m3 ; 14 + mova [ dstq+stride3q*2], m4 ; 6 + vpalignr m3, m5, m0, 4 + vpalignr m4, m1, m5, 4 + sub dst3q, strideq + mova [dst3q+stride5q*2], m3 ; 13 + mova [dst3q+strideq*2 ], m4 ; 5 + mova [dst3q+stride3q*4], m0 ; 15 + vpalignr m3, m5, m0, 6 + vpalignr m4, m1, m5, 6 + mova [dstq+stride3q*4], m3 ; 12 + mova [dst3q+strideq*1], m4 ; 4 + vpalignr m3, m5, m0, 8 + vpalignr m4, m1, m5, 8 + mova [dst3q+strideq*8], m3 ; 11 + mova [dst3q+strideq*0], m4 ; 3 + vpalignr m3, m5, m0, 10 + vpalignr m4, m1, m5, 10 + mova [dstq+stride5q*2], m3 ; 10 + mova [dstq+strideq*2 ], m4 ; 2 + vpalignr m3, m5, m0, 12 + vpalignr m4, m1, m5, 12 + mova [dst3q+stride3q*2], m3 ; 9 + mova [dstq+strideq*1 ], m4 ; 1 + vpalignr m3, m5, m0, 14 + vpalignr m4, m1, m5, 14 + mova [dstq+strideq*8], m3 ; 8 + mova [dstq+strideq*0], m4 ; 0 + mova [dst3q+strideq*4], m5 ; 7 + RET + +%if ARCH_X86_64 +cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a + mova m0, [lq+mmsize*0+0] ; l[0-15] + mova m1, [lq+mmsize*1+0] ; l[16-31] + movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno + mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop + mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345 + vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 + vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01 + vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012 + LOWPASS 0, 6, 7 ; L[0-15] + vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg + vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 1, 5, 6 ; L[16-31]# + vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx + vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq + LOWPASS 2, 3, 6 ; A[0-15] + movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 + vperm2i128 m6, m4, m4, q2001 ; yz012345........ + vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. + LOWPASS 3, 4, 7 ; A[16-31]. + vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH + vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] + vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX + DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea stride7q, [strideq*4+stride3q] + lea dst24q, [dst8q+stride3q*8] + lea dst8q, [dst8q+strideq*8] + mov cntd, 2 + +.loop: + mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 + mova [dst24q+stride7q+32], m1 + mova [dst8q+stride7q+0], m1 + mova [dst8q+stride7q+32], m2 + vpalignr m6, m4, m1, 2 + vpalignr m7, m5, m0, 2 + vpalignr m9, m8, m2, 2 + mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 + mova [dst24q+stride3q*2+32], m6 + mova [dst8q+stride3q*2+0], m6 + mova [dst8q+stride3q*2+32], m9 + vpalignr m6, m4, m1, 4 + vpalignr m7, m5, m0, 4 + vpalignr m9, m8, m2, 4 + mova [dst24q+stride5q+0], m7 ; 29 21 13 5 + mova [dst24q+stride5q+32], m6 + mova [dst8q+stride5q+0], m6 + mova [dst8q+stride5q+32], m9 + vpalignr m6, m4, m1, 6 + vpalignr m7, m5, m0, 6 + vpalignr m9, m8, m2, 6 + mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 + mova [dst24q+strideq*4+32], m6 + mova [dst8q+strideq*4+0], m6 + mova [dst8q+strideq*4+32], m9 + vpalignr m6, m4, m1, 8 + vpalignr m7, m5, m0, 8 + vpalignr m9, m8, m2, 8 + mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 + mova [dst24q+stride3q+32], m6 + mova [dst8q+stride3q+0], m6 + mova [dst8q+stride3q+32], m9 + vpalignr m6, m4, m1, 10 + vpalignr m7, m5, m0, 10 + vpalignr m9, m8, m2, 10 + mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2 + mova [dst24q+strideq*2+32], m6 + mova [dst8q+strideq*2+0], m6 + mova [dst8q+strideq*2+32], m9 + vpalignr m6, m4, m1, 12 + vpalignr m7, m5, m0, 12 + vpalignr m9, m8, m2, 12 + mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 + mova [dst24q+strideq+32], m6 + mova [dst8q+strideq+0], m6 + mova [dst8q+strideq+32], m9 + vpalignr m6, m4, m1, 14 + vpalignr m7, m5, m0, 14 + vpalignr m9, m8, m2, 14 + mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 + mova [dst24q+strideq*0+32], m6 + mova [dst8q+strideq*0+0], m6 + mova [dst8q+strideq*0+32], m9 + mova m0, m5 + mova m5, m1 + mova m1, m4 + mova m4, m2 + mova m2, m8 + mova m8, m3 + sub dst24q, stride7q + sub dst24q, strideq + sub dst8q, stride7q + sub dst8q, strideq + dec cntd + jg .loop + RET +%endif +%endif + %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a movifnidn aq, amp diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm.asm b/media/ffvpx/libavcodec/x86/vp9itxfm.asm index 57d6d353b..2c63fe514 100644 --- a/media/ffvpx/libavcodec/x86/vp9itxfm.asm +++ b/media/ffvpx/libavcodec/x86/vp9itxfm.asm @@ -1581,33 +1581,30 @@ cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob VP9_IDCT16_YMM_1D mova [blockq+224], m7 - mova [blockq+480], m15 - pxor m15, m15 ; store - VP9_IDCT8_WRITEx2 0, 1, 6, 7, 15, [pw_512], 6 + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 2, 3, 6, 7, 15, [pw_512], 6 + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 4, 5, 6, 7, 15, [pw_512], 6 + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] mova m6, [blockq+192] mova m7, [blockq+224] - SWAP 0, 15 - mova m15, [blockq+480] - VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, [pw_512], 6 + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 8, 9, 1, 2, 0, [pw_512], 6 + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, [pw_512], 6 + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, [pw_512], 6 + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] - VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, [pw_512], 6 + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 lea dstq, [dstq+2*strideq] ; at the end of the loop, m0 should still be zero ; use that to zero out block coefficients + pxor m0, m0 ZERO_BLOCK blockq, 32, 16, m0 RET %endif @@ -1987,6 +1984,173 @@ IADST16_FN idct, IDCT16, iadst, IADST16, avx IADST16_FN iadst, IADST16, idct, IDCT16, avx IADST16_FN iadst, IADST16, iadst, IADST16, avx +; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128] +; out: m[0-15] except m6, which is in [blockq+192] +; uses blockq as scratch space +%macro VP9_IADST16_YMM_1D 0 + mova [blockq+ 32], m3 + mova [blockq+ 64], m7 + mova [blockq+ 96], m8 + + ; first half of round 1 + VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d] + VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d] + VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w] + VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w] + + VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d] + VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w] + VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w] + + ; half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d] + VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d] + VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w] + VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w] + + SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6 + SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7 + + mova m0, [blockq+ 0] + mova m4, [blockq+128] + mova m3, [blockq+ 32] + mova m7, [blockq+ 64] + mova m8, [blockq+ 96] + mova [blockq+ 0], m1 + mova [blockq+128], m14 + mova [blockq+ 32], m6 + mova [blockq+ 64], m9 + mova [blockq+ 96], m10 + + ; second half of round 1 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d] + VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d] + VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w] + VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w] + + VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d] + VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w] + VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w] + + ; second half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d] + VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d] + VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w] + VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w] + + SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4 + SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5 + + mova m10, [blockq+ 96] + mova [blockq+ 96], m12 + + ; round 3 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d] + VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d] + VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w] + VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192] + PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w] + + VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d] + VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192] + PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w] + + mova m1, [blockq+ 0] + mova m14, [blockq+128] + mova m6, [blockq+ 32] + mova m9, [blockq+ 64] + mova m12, [blockq+ 96] + mova [blockq+ 0], m10 + mova [blockq+128], m5 + + SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a + SUMSUB_BA w, 1, 3, 5 + PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a + + SUMSUB_BA w, 9, 11, 5 + PSIGNW m9, [pw_m1] ; m9=out1, m11=t10 + SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11 + + VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6 + mova m5, [blockq+128] + mova [blockq+192], m11 + PSIGNW m15, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10 + + PSIGNW m3, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8 + VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4 + + mova m10, [blockq+ 0] + + SWAP 0, 14, 6, 11, 8, 12, 10 + SWAP 1, 9, 15, 4, 7, 3, 5 + SWAP 5, 9, 15 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +%macro IADST16_YMM_FN 4 +INIT_YMM avx2 +cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob + mova m1, [blockq+ 32] + mova m2, [blockq+ 64] + mova m3, [blockq+ 96] + mova m5, [blockq+160] + mova m6, [blockq+192] + mova m7, [blockq+224] + mova m8, [blockq+256] + mova m9, [blockq+288] + mova m10, [blockq+320] + mova m11, [blockq+352] + mova m12, [blockq+384] + mova m13, [blockq+416] + mova m14, [blockq+448] + mova m15, [blockq+480] + + VP9_%2_YMM_1D + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + [blockq+192], [blockq+128], 1 + mova [blockq+ 0], m0 + VP9_%4_YMM_1D + + mova [blockq+224], m7 + + ; store + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + mova m6, [blockq+192] + mova m7, [blockq+224] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + pxor m0, m0 + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_YMM_FN idct, IDCT16, iadst, IADST16 +IADST16_YMM_FN iadst, IADST16, idct, IDCT16 +IADST16_YMM_FN iadst, IADST16, iadst, IADST16 +%endif + ;--------------------------------------------------------------------------------------------- ; void vp9_idct_idct_32x32_add_(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); ;--------------------------------------------------------------------------------------------- diff --git a/media/ffvpx/libavcodec/x86/vp9mc.asm b/media/ffvpx/libavcodec/x86/vp9mc.asm index 9152ba541..f64161b2c 100644 --- a/media/ffvpx/libavcodec/x86/vp9mc.asm +++ b/media/ffvpx/libavcodec/x86/vp9mc.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* VP9 MC SIMD optimizations +;* VP9 motion compensation SIMD optimizations ;* ;* Copyright (c) 2013 Ronald S. Bultje ;* @@ -440,9 +440,8 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f mova m10, [filteryq+96] %endif .loop: - ; FIXME maybe reuse loads from previous rows, or just - ; more generally unroll this to prevent multiple loads of - ; the same data? + ; FIXME maybe reuse loads from previous rows, or just more generally + ; unroll this to prevent multiple loads of the same data? movh m0, [srcq] movh m1, [srcq+sstrideq] movh m2, [srcq+sstrideq*2] -- cgit v1.2.3